To compare the performance of varKode to Skmer, we will use leave-one-out cross validation: we remove one sample from the dataset, train a varKode model or make a skmer reference with the remaining samples, and then use the sample left out as query. We then record whether or not we correctly identify this sample in varKoder, and whether or not the closest sample with Skmer has the same identification.
For traditional barcodes, we assembled the genome of each sample, and then used BLAST to search for each of the traditional barcode genes. We recorded if we could find this gene in the assembly, coding as missing data if we could not. We then recorded whether the best BLAST hit for a sample was the correct species.
#rm(list=ls())
library(tidyverse)
── Attaching core tidyverse packages ───────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.4.4 ✔ tibble 3.2.1
✔ lubridate 1.9.3 ✔ tidyr 1.3.1
✔ purrr 1.0.2 ── Conflicts ─────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (http://conflicted.r-lib.org/) to force all conflicts to become errors
library(future)
library(ggthemes)
library(patchwork)
library(cowplot)
Attaching package: ‘cowplot’
The following object is masked from ‘package:patchwork’:
align_plots
The following object is masked from ‘package:ggthemes’:
theme_map
The following object is masked from ‘package:lubridate’:
stamp
library(patchwork)
library(phytools)
Loading required package: ape
Attaching package: ‘ape’
The following object is masked from ‘package:dplyr’:
where
Loading required package: maps
Attaching package: ‘maps’
The following object is masked from ‘package:purrr’:
map
library(ape)
library(furrr)
set.seed(14164)
#function to prevent overwriting plots
safe_ggsave <- function(filename, ...) {
  # Save a plot with ggsave(), refusing to overwrite an existing file.
  # Extra arguments are forwarded to ggsave() unchanged.
  if (!file.exists(filename)) {
    ggsave(filename, ...)
  } else {
    warning(paste("File", filename, "already exists. File not overwritten."))
    invisible(NULL)
  }
}
For VarKoder, we used leave-one-out cross-validation to test the accuracy for family, genera, species in the joint Malpighiaceae-Chrysobalanaceae dataset. We used as input data varKodes produced from kmers of size 7 and 500Kbp to 200Mbp of data, or all of the data available if less than 200 Mbp. For each sample, we built a model using as input data from all other samples. Then we queried the sample left out, using as input the images generated from 500Kb to the total data available. Now we will summarize the results.
In this test, we used varKoder v0.8.0. Let’s process the results.
read_and_process_xval = function(infolder){
  # Read every varKoder leave-one-out 'predictions.csv' found under `infolder`
  # (in parallel via furrr) and flag, per query, whether the family / genus /
  # species predictions are correct and/or incorrect.
  plan(multisession(workers = 12))
  varkoder_results = list.files(infolder,
                                'predictions.csv',
                                recursive=T,
                                full.names = T) %>%
    furrr::future_map_dfr(~read_csv(.x) %>% mutate(sample_id = as.character(sample_id))) %>%
    select(-1) %>%  # drop the unnamed row-index column written into the CSVs
    # Older outputs encode query size as strings like "500K"; normalize all to
    # numeric base pairs. Note is.character() is a single scalar test on the
    # whole column, recycled by case_when over every row.
    mutate(query_basepairs = case_when(
      is.character(query_basepairs) ~ as.numeric(str_remove(query_basepairs, "K")) * 1000,
      TRUE ~ as.numeric(query_basepairs)
    )) %>%
    filter(str_detect(format(query_basepairs, scientific = FALSE,trim = TRUE), "^[125]0+$")) %>% #we will ignore queries that are not standardized sizes
    rename(query_bp = query_basepairs) %>%
    mutate(quality_included = T)
  plan(sequential)
  # All distinct taxonomic labels across the dataset, with quality flags removed.
  all_taxlabels = str_remove(varkoder_results$actual_labels,";*low_quality:True;*") %>% str_split(';') %>% unlist %>% unique
  varkoder_results = varkoder_results %>%
    # Split the semicolon-separated label strings into list-columns.
    mutate(query_labels = str_remove(actual_labels,";*low_quality:True;*") %>% str_split(';'),
           predicted_list = str_split(predicted_labels,';')
    ) %>%
    rowwise() %>%
    # Per row: a level is "correct" when the true label at that level appears
    # among the predictions, and "incorrect" when some predicted label at that
    # level is not a true label. Species flags are NA when the sample carries
    # no species label at all.
    mutate(family_correct = query_labels[str_detect(query_labels,'family')] %in% predicted_list,
           genus_correct = query_labels[str_detect(query_labels,'genus')] %in% predicted_list,
           species_correct = ifelse(any(str_detect(query_labels,'species')),
                                    query_labels[str_detect(query_labels,'species')] %in% predicted_list,
                                    NA
           ),
           family_incorrect = any(!(predicted_list[str_detect(predicted_list,'family')] %in% query_labels[str_detect(query_labels,'family')])),
           genus_incorrect = any(!(predicted_list[str_detect(predicted_list,'genus')] %in% query_labels[str_detect(query_labels,'genus')])),
           species_incorrect = ifelse(any(str_detect(query_labels,'species')),
                                      any(!(predicted_list[str_detect(predicted_list,'species')] %in% query_labels[str_detect(query_labels,'species')])),
                                      NA
           )
    )
  return(varkoder_results)
}
summarize_results = function(res,level){
  # Summarize per-sample classification outcomes at one taxonomic level
  # ('family', 'genus' or 'species') into counts (N) and proportions (p) of
  # correct / ambiguous / incorrect / inconclusive results per query size.
  #
  # Extract the flag columns as plain vectors with [[ : single-bracket
  # indexing on a tibble returns a one-column tibble, which ifelse() only
  # handles by accident.
  correct = res[[str_c(level,'correct',sep='_')]]
  incorrect = res[[str_c(level,'incorrect',sep='_')]]
  res = res %>%
    ungroup() %>%
    mutate(low_quality = str_detect(actual_labels,"low_quality:True"),
           # NA flags (e.g. samples without a species label) propagate through
           # ifelse() to an NA result and are dropped by the filter below.
           result = as.character(ifelse(correct & !incorrect, 'correct',
                                 ifelse(correct & incorrect, 'ambiguous',
                                 ifelse(!correct & incorrect, 'incorrect',
                                        'inconclusive'
                                 ))))
    ) %>%
    filter(!is.na(result)) %>%
    group_by(query_bp,result) %>%
    summarise(N=n(), .groups = 'drop') %>%
    group_by(query_bp) %>%
    mutate(p= N/sum(N)) %>%
    ungroup() %>%
    # Round-trip query_bp through a factor so complete() fills every observed
    # query_bp x result combination, with zeros where a category is absent.
    mutate(query_bp = as.factor(query_bp)) %>%
    complete(query_bp,result, fill = list(p = 0, N = 0)) %>%
    mutate(query_bp = as.numeric(as.character(query_bp))) %>%
    ungroup()
  return(res)
}
plot_area = function(sum_df, title, relative = FALSE, grid = TRUE, xlim_all = TRUE, wrap){
  # Stacked area plot of classification results vs. query size.
  #
  # sum_df   : output of summarize_results() (columns query_bp, result, N, p)
  # title    : plot title
  # relative : plot fraction of samples (p) instead of counts (N)
  # grid     : overlay horizontal grid lines on top of the areas
  # xlim_all : use the full standardized break range for the x axis rather
  #            than the observed range of sum_df$query_bp
  # wrap     : optional facet_wrap() formula (as a string or formula)
  #
  # x-axis breaks: the standardized query sizes from 500 Kbp to 200 Mbp.
  breaks = c(500000,
             1000000,
             2000000,
             5000000,
             10000000,
             20000000,
             50000000,
             100000000,
             200000000
  )
  xlimits = if (xlim_all) range(breaks) else range(sum_df$query_bp)
  # Fix the stacking order of result categories.
  sum_df = sum_df %>%
    mutate(result = factor(result,ordered = T, levels = c('correct','ambiguous','inconclusive','incorrect')))
  if (relative){
    ylimits = c(0,1)
  } else {
    ylimits = c(0,sum_df %>% group_by(query_bp) %>% summarize(N=sum(N)) %>% pull(N) %>% max)
  }
  # Get colors from a Color Brewer palette
  brewer_colors <- RColorBrewer::brewer.pal(4, "Accent")
  # Build one plot for both modes; only the y variable and its label differ.
  # (The previous version duplicated the whole ggplot call in each branch and
  # also carried a no-op scale_alpha_manual with no alpha aesthetic mapped.)
  yvar = if (relative) "p" else "N"
  ylabel = if (relative) 'Fraction of samples' else 'Number of samples'
  p1 = ggplot(sum_df, aes(x=query_bp, y=.data[[yvar]], fill=result)) +
    geom_area(position='stack') +
    scale_fill_manual(values = setNames(brewer_colors, c("correct", "ambiguous", "inconclusive", "incorrect"))) +
    scale_x_log10(labels = scales::label_number(scale_cut = scales::cut_si('bp')),breaks = breaks) +
    ggtitle(title) +
    ylab(ylabel) +
    xlab('Base pairs in query images') +
    theme_few() +
    theme(axis.text.x = element_text(hjust=1,angle=45))
  # Add the y scale exactly once. The previous version added scale_y_continuous()
  # unconditionally and then again when grid = TRUE, which triggered
  # "Scale for y is already present" warnings on every call.
  if (grid){
    p1 = p1 +
      scale_y_continuous(n.breaks = 10, minor_breaks = waiver()) +
      theme(panel.background = element_rect(fill = NA),
            panel.grid.major.y = element_line(colour = gray(0.5)),
            panel.grid.minor.y = element_line(colour = gray(0.6),linetype = 2),
            panel.ontop = TRUE)
  } else {
    p1 = p1 + scale_y_continuous()
  }
  p1 = p1 + coord_cartesian(xlim=xlimits, ylim=ylimits,expand = FALSE)
  if (!missing(wrap)) {
    p1 = p1 + facet_wrap(as.formula(wrap))
  }
  return(p1)
}
Now let’s plot genus-level accuracy:
results = read_and_process_xval('additional_tests_cgr/results_cgr_varKoder_vit_large_patch32_224/')
summary_genus = summarize_results(results,'genus')
p_genus = plot_area(summary_genus, 'varKoder genus', relative = TRUE)
p_genus
Now the same but with species
summary_species = summarize_results(results,'species')
p_species = plot_area(summary_species, 'varKoder species', relative = TRUE)
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
p_species
Finally, family
summary_family = summarize_results(results,'family')
p_family = plot_area(summary_family, 'varKoder family', relative = TRUE)
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
p_family
Now we will try to identify which samples failed and why they failed. Particularly, how do DNA quality, amount of data, and the number of samples per class impact results? We will use genus-level predictions to test.
# Extract, for every query, the confidence the model assigned to the *correct*
# genus. Per-label confidence columns are named after the label itself
# (e.g. "genus:Byrsonima"); pivot them long and keep only the true genus.
genus_predictions = results %>%
  mutate(predicted_genus = str_extract(predicted_labels, 'genus:[^;]*'),
         actual_genus = str_extract(actual_labels, 'genus:[^;]*')) %>%
  select(-starts_with('family'),-starts_with('species')) %>%
  pivot_longer(cols = starts_with("genus"), names_to = "predicted_label", values_to = "confidence") %>%
  filter(actual_genus == predicted_label) %>%
  select(query_bp, sample_id, basefrequency_sd, actual_genus, confidence) %>%
  # NOTE(review): query_bp was already converted to numeric base pairs inside
  # read_and_process_xval() (the "K" suffix removed and scaled by 1000), so
  # re-removing "K" and multiplying by 1000 here may double-scale the values —
  # confirm the expected format of query_bp at this point.
  mutate(query_bp = 1000*(str_remove(query_bp, "K") %>% as.integer))
# Count the distinct samples available per genus and attach that count
# (N_samples) to every prediction row.
genus_predictions = genus_predictions %>%
  select(sample_id, actual_genus) %>%
  distinct() %>%
  group_by(actual_genus) %>%
  summarise(N_samples = n()) %>%
  right_join(genus_predictions)
Joining with `by = join_by(actual_genus)`
genus_predictions %>% arrange(N_samples)
Now let’s make some plots. First, what is the effect of number of samples per class in confidence?
set.seed(13214526)
plot_genus_N_vs_conf = ggplot(genus_predictions, aes(x = N_samples-1,
y = confidence)) +
scale_color_viridis_c() +
geom_jitter(alpha=0.3) +
scale_x_log10() +
#ylab('Confidence in correct prediction\n(logit scale)') +
ylab('Confidence in correct genus prediction') +
xlab('Number of training samples in correct genus\n(log scale)') +
#scale_y_continuous(trans = "logit", breaks = c(1e-4,0.001,0.01,0.1,0.25,0.5,0.75,0.9,0.99,0.999,1-1e-4)) +
scale_y_continuous(limits=c(0,1)) +
theme_few() +
theme(panel.grid.major.y = element_line(colour = gray(0.8)))
plot_genus_N_vs_conf
Now, what is the effect of sample quality in confidence?
set.seed(13214526)
plot_genus_freqsd_vs_conf = ggplot(genus_predictions, aes(x = basefrequency_sd, y = confidence)) +
geom_point(alpha=0.3) +
scale_x_log10() +
#scale_y_continuous(trans = "logit", breaks = c(1e-4,0.001,0.01,0.1,0.25,0.5,0.75,0.9,0.99,0.999,1-1e-4)) +
scale_y_continuous(limits=c(0,1)) +
#ylab('Confidence in correct prediction\n(logit scale)') +
ylab('Confidence in correct genus prediction') +
xlab('Standard deviation of base frequencies') +
theme_few() +
theme(panel.grid.major.y = element_line(colour = gray(0.8)))
plot_genus_freqsd_vs_conf
Now, what is the effect of amount of data in confidence?
set.seed(13214526)
plot_genus_bp_vs_conf = ggplot(genus_predictions, aes(x = query_bp, y = confidence)) +
geom_jitter(alpha=0.3) +
#scale_y_continuous(trans = "logit", breaks = c(1e-4,0.001,0.01,0.1,0.25,0.5,0.75,0.9,0.99,0.999,1-1e-4)) +
scale_y_continuous(limits=c(0,1)) +
#ylab('Confidence in correct prediction\n(logit scale)') +
ylab('Confidence in correct genus prediction') +
xlab('Base pairs in query images\n(log scale)') +
scale_x_log10() +
theme_few() +
theme(panel.grid.major.y = element_line(colour = gray(0.8)))
plot_genus_bp_vs_conf
Let’s put it all together now in a linear model. We remove one from N_samples since one sample is used for validation in the leave-one-out cross-validation that we did.
lm_data = genus_predictions %>%
mutate(confidence = ifelse(confidence == 1, confidence-0.0000001, confidence),
confidence = car::logit(confidence)) %>%
mutate(query_bp = (query_bp - mean(query_bp))/sd(query_bp),
basefrequency_sd = (basefrequency_sd - mean(basefrequency_sd))/sd(basefrequency_sd),
N_samples = ((N_samples-1) - mean(N_samples-1))/sd(N_samples-1)
)
full_model = lm(formula = confidence~query_bp*basefrequency_sd*N_samples, data = lm_data)
full_model
Call:
lm(formula = confidence ~ query_bp * basefrequency_sd * N_samples,
data = lm_data)
Coefficients:
(Intercept) query_bp
4.488259 0.125846
basefrequency_sd N_samples
-0.553064 1.662176
query_bp:basefrequency_sd query_bp:N_samples
0.204618 -0.005384
basefrequency_sd:N_samples query_bp:basefrequency_sd:N_samples
-0.103651 -0.037260
reduced_model = step(full_model,direction = 'both')
Start: AIC=2660.3
confidence ~ query_bp * basefrequency_sd * N_samples
Df Sum of Sq RSS AIC
- query_bp:basefrequency_sd:N_samples 1 0.13619 7282.7 2658.3
<none> 7282.6 2660.3
Step: AIC=2658.35
confidence ~ query_bp + basefrequency_sd + N_samples + query_bp:basefrequency_sd +
query_bp:N_samples + basefrequency_sd:N_samples
Df Sum of Sq RSS AIC
- query_bp:N_samples 1 0.0505 7282.8 2656.4
<none> 7282.7 2658.3
- basefrequency_sd:N_samples 1 6.7172 7289.4 2658.4
+ query_bp:basefrequency_sd:N_samples 1 0.1362 7282.6 2660.3
- query_bp:basefrequency_sd 1 14.7993 7297.5 2660.9
Step: AIC=2656.36
confidence ~ query_bp + basefrequency_sd + N_samples + query_bp:basefrequency_sd +
basefrequency_sd:N_samples
Df Sum of Sq RSS AIC
<none> 7282.8 2656.4
- basefrequency_sd:N_samples 1 6.8663 7289.6 2656.5
+ query_bp:N_samples 1 0.0505 7282.7 2658.3
- query_bp:basefrequency_sd 1 14.8391 7297.6 2659.0
summary(reduced_model)
Call:
lm(formula = confidence ~ query_bp + basefrequency_sd + N_samples +
query_bp:basefrequency_sd + basefrequency_sd:N_samples, data = lm_data)
Residuals:
Min 1Q Median 3Q Max
-9.5644 -0.9384 0.2772 1.1785 5.9867
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.49197 0.04065 110.510 < 2e-16 ***
query_bp 0.13430 0.04627 2.903 0.00373 **
basefrequency_sd -0.54313 0.06556 -8.285 < 2e-16 ***
N_samples 1.66672 0.04017 41.496 < 2e-16 ***
query_bp:basefrequency_sd 0.22549 0.10524 2.143 0.03225 *
basefrequency_sd:N_samples -0.08782 0.06025 -1.457 0.14513
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.798 on 2253 degrees of freedom
Multiple R-squared: 0.5226, Adjusted R-squared: 0.5215
F-statistic: 493.2 on 5 and 2253 DF, p-value: < 2.2e-16
plot(reduced_model)
broom::tidy(reduced_model) %>% write_csv('lm_table.csv')
Now let’s save the three of them as a single plot using cowplot.
preds = genus_predictions %>%
mutate(N_samples=N_samples-1) %>%
select(original_N = N_samples,
original_bp = query_bp,
original_sd = basefrequency_sd) %>%
mutate(query_bp = (original_bp - mean(original_bp))/sd(original_bp),
basefrequency_sd = (original_sd - mean(original_sd))/sd(original_sd),
N_samples = ((original_N) - mean(original_N))/sd(original_N)
)
preds = bind_rows(expand.grid(query_bp=unique(preds$query_bp),
N_samples=0,
basefrequency_sd=0
),
expand.grid(query_bp=0,
N_samples=unique(preds$N_samples),
basefrequency_sd=0
),
expand.grid(query_bp=0,
N_samples=0,
basefrequency_sd=unique(preds$basefrequency_sd)
)
)
logistic <- function(L) {
  # Inverse-logit: map a real number (or vector) onto (0, 1).
  exp_term <- exp(-L)
  1 / (1 + exp_term)
}
preds = predict(reduced_model, newdata=preds, interval='conf') %>%
as_tibble() %>%
mutate_all(~logistic(.x)+0.0000001) %>%
rename(confidence=fit) %>%
bind_cols(preds %>%
mutate(N_samples = N_samples*sd(genus_predictions$N_samples-1)+mean(genus_predictions$N_samples-1),
basefrequency_sd = basefrequency_sd*sd(genus_predictions$basefrequency_sd)+
mean(genus_predictions$basefrequency_sd),
query_bp = query_bp*sd(genus_predictions$query_bp)+mean(genus_predictions$query_bp))
)
combined_conf = patchwork::wrap_plots(plot_genus_N_vs_conf +
theme(text = element_text(size=8)) +
geom_line(data=(filter(preds,
N_samples != mean(genus_predictions$N_samples-1))),
color="blue") +
geom_ribbon(data=(filter(preds,
N_samples != mean(genus_predictions$N_samples-1))),
aes(ymin=lwr, ymax=upr),
fill="lightblue",
alpha=0.4),
plot_genus_bp_vs_conf + theme(axis.title.y=element_blank(),
axis.text.y=element_blank(),
text = element_text(size=8)) +
geom_line(data=(filter(preds,
query_bp != mean(genus_predictions$query_bp))),
color="blue") +
geom_ribbon(data=(filter(preds,
query_bp != mean(genus_predictions$query_bp))),
aes(ymin=lwr, ymax=upr),
fill="lightblue",
alpha=0.4),
plot_genus_freqsd_vs_conf + theme(axis.title.y=element_blank(),
axis.text.y=element_blank(),
text = element_text(size=8)) +
geom_line(data=(filter(preds,
basefrequency_sd != mean(genus_predictions$basefrequency_sd))),
color="blue") +
geom_ribbon(data=(filter(preds,
basefrequency_sd != mean(genus_predictions$basefrequency_sd))),
aes(ymin=lwr, ymax=upr),
fill="lightblue",
alpha=0.4)) +
patchwork::plot_annotation(tag_levels = 'A',
title = 'Factors affecting varKode prediction accuracy',theme = theme(plot.title = element_text(hjust=0.5)))
combined_conf
safe_ggsave (filename = 'images_manuscript/supp_conf_predictors.pdf',device = 'pdf',width = 7,height=3,units = 'in',useDingbats=F)
Warning: File images_manuscript/supp_conf_predictors.pdf already exists. File not overwritten.
For skmer, we left each sample out, built a reference and then queried that sample. We have several files in which reference samples are ordered by their distance to the query; here we will evaluate whether the closest sample is from the correct species or genus.
Because it is not clear how skmer behaves for different levels of coverage, we repeated this for several input sizes (in number of basepairs) as query, but always used the maximum input size available (up to 200Mb) for references.
Let’s make a function that extracts these results as a table.
samp_labels = results %>% select(sample_id,actual_labels) %>% distinct()
extract_skmer_results = function(file_path) {
  # Parse one skmer output file into a one-row tibble: the first line names
  # the query sample and its input size, the second line holds the closest
  # reference and its distance.
  top_lines <- readLines(file_path, n = 2)
  # Query line looks like "<sample_id>@<size>K ..."
  query_info <- str_match(top_lines[1], "\\s*(.*?)@(\\d+K)")
  # Best-hit line looks like "<reference_id>@... <distance>"
  hit_info <- str_match(top_lines[2], "\\s*(.*?)@.*\\s+(\\d+\\.\\d+)")
  tibble(
    sample_id = query_info[, 2],
    query_bp = query_info[, 3],
    closest_reference_sample_id = hit_info[, 2],
    closest_distance = as.numeric(hit_info[, 3])
  )
}
Now we will apply this function to all skmer output files.
# Parse every skmer cross-validation output (in parallel), attach the true and
# predicted label strings via the global samp_labels lookup, then flag
# correctness at each taxonomic level using the same rowwise logic as
# read_and_process_xval().
plan(multisession(workers = 12))
skmer_results_df = furrr::future_map_dfr(
  list.files('Malpighiales/skmer/skmer_xval_results/', full.names = T),
  ~ extract_skmer_results(.x)
) %>%
  # True labels for the query sample...
  left_join(samp_labels, by = 'sample_id') %>%
  # ...and the labels of its closest reference, used as the "prediction".
  left_join(
    samp_labels %>% select(
      closest_reference_sample_id = 'sample_id',
      predicted_labels = actual_labels
    ),
    by = 'closest_reference_sample_id'
  ) %>%
  mutate(
    query_labels = str_remove(actual_labels, ";*low_quality:True;*") %>% str_split(';'),
    predicted_list = str_split(predicted_labels, ';')
  ) %>%
  rowwise() %>%
  # Per row: "correct" when the true label at a level is among the predicted
  # labels; "incorrect" when some predicted label at that level is not a true
  # label. Species flags are NA when no species label exists for the sample.
  mutate(
    family_correct = query_labels[str_detect(query_labels, 'family')] %in% predicted_list,
    genus_correct = query_labels[str_detect(query_labels, 'genus')] %in% predicted_list,
    species_correct = ifelse(any(str_detect(
      query_labels, 'species'
    )),
    query_labels[str_detect(query_labels, 'species')] %in% predicted_list,
    NA),
    family_incorrect = any(!(predicted_list[str_detect(predicted_list, 'family')] %in% query_labels[str_detect(query_labels, 'family')])),
    genus_incorrect = any(!(predicted_list[str_detect(predicted_list, 'genus')] %in% query_labels[str_detect(query_labels, 'genus')])),
    species_incorrect = ifelse(any(str_detect(
      query_labels, 'species'
    )),
    any(!(
      predicted_list[str_detect(predicted_list, 'species')] %in% query_labels[str_detect(query_labels, 'species')]
    )),
    NA),
    # Here query_bp is still the raw "<n>K" string from the skmer file name,
    # so convert it to numeric base pairs.
    query_bp = as.numeric(str_remove(query_bp, "K")) * 1000
  )
plan(sequential)
skmer_results_df
Now let’s summarize and plot by genus:
skmer_summary_genus = summarize_results(skmer_results_df,'genus')
p_skmer_genus = plot_area(skmer_summary_genus, 'Skmer genus', relative = TRUE)
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
p_skmer_genus
Now by species. In Skmer, there is no inconclusive result: if there is no correct species prediction, it means that a sample was predicted in the wrong genus and therefore it is incorrect
skmer_summary_species = summarize_results(skmer_results_df,'species') %>%
mutate(result = ifelse(result == 'correct', 'correct','incorrect')) %>%
group_by(query_bp,result) %>%
summarise_all(sum)
p_skmer_species = plot_area(skmer_summary_species, 'Skmer species', relative = TRUE)
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
p_skmer_species
And now by family:
skmer_summary_family = summarize_results(skmer_results_df,'family')
skmer_summary_family
p_skmer_family = plot_area(skmer_summary_family, 'Skmer family', relative = TRUE)
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
p_skmer_family
Let’s now read the traditional barcode BLAST results and summarize them in the same way as skmer and varKoder. Let’s start by defining a function that reads the data so we can summarize it using the previously defined functions.
read_traditional_barcodes = function(bp) {
  # Read the BLAST-based barcode identifications for one query input size
  # (`bp`, in Mbp) and flag correct/incorrect matches at the family, genus and
  # species levels so they can be summarized like the varKoder/skmer tables.
  # Relies on the global samp_labels lookup (sample_id -> actual_labels).
  input_file = paste0(
    'Malpighiales/traditional_barcodes/2_blast_phylogeny_result/Genus/',
    bp,
    'M_blast_phylo_sum_sp.tsv'
  )
  barcode_res = read_delim(input_file) %>%
    # One column per marker; reshape to one row per sample x marker.
    pivot_longer(-sp, names_to = 'marker', values_to = 'closest_reference_sample_id') %>%
    rename(sample_id = 'sp') %>%
    mutate(
      # Strip the "@<suffix>" so IDs match samp_labels$sample_id.
      sample_id = str_remove_all(sample_id, '@.+'),
      closest_reference_sample_id = str_remove_all(closest_reference_sample_id, '@.+'),
      predicted_labels = samp_labels$actual_labels[match(closest_reference_sample_id, samp_labels$sample_id)],
      actual_labels = samp_labels$actual_labels[match(sample_id, samp_labels$sample_id)]
    ) %>%
    # The concatenated phylogeny is analyzed separately, not as a marker.
    filter(marker != 'Concatenated_phylogeny') %>%
    mutate(
      query_labels = str_remove(actual_labels, ";*low_quality:True;*") %>% str_split(';'),
      predicted_list = str_split(predicted_labels, ';')
    ) %>%
    rowwise() %>%
    # Same per-row correctness logic as the varKoder and skmer pipelines.
    mutate(
      family_correct = query_labels[str_detect(query_labels, 'family')] %in% predicted_list,
      genus_correct = query_labels[str_detect(query_labels, 'genus')] %in% predicted_list,
      species_correct = ifelse(any(str_detect(
        query_labels, 'species'
      )),
      query_labels[str_detect(query_labels, 'species')] %in% predicted_list,
      NA),
      family_incorrect = any(!(predicted_list[str_detect(predicted_list, 'family')] %in% query_labels[str_detect(query_labels, 'family')])),
      genus_incorrect = any(!(predicted_list[str_detect(predicted_list, 'genus')] %in% query_labels[str_detect(query_labels, 'genus')])),
      species_incorrect = ifelse(any(str_detect(
        query_labels, 'species'
      )),
      any(!(
        predicted_list[str_detect(predicted_list, 'species')] %in% query_labels[str_detect(query_labels, 'species')]
      )),
      NA)
    ) %>%
    # Where no BLAST hit was found (predicted_labels is NA) force any defined
    # flag to FALSE, so the sample scores as inconclusive rather than NA.
    mutate_at(vars(ends_with("_correct"), ends_with("_incorrect")),
              ~ ifelse(is.na(predicted_labels) & !is.na(.), FALSE, .)) %>%
    # Convert the Mbp input size to base pairs for joint plotting.
    mutate(query_bp = bp * 1e6)
  return(barcode_res)
}
Now we can apply this function to all of our results:
results_barcodes = purrr::map_dfr(c(0.5,1,2,5,10,20,50,100,200),read_traditional_barcodes)
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
dat <- vroom(...)
problems(dat)Rows: 288 Columns: 7── Column specification ─────────────────────────────────────────────────────────
Delimiter: "\t"
chr (6): sp, matK, rbcL, ndhF, trnL-F, ITS
lgl (1): Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
dat <- vroom(...)
problems(dat)Rows: 288 Columns: 7── Column specification ─────────────────────────────────────────────────────────
Delimiter: "\t"
chr (6): sp, matK, rbcL, ndhF, trnL-F, ITS
lgl (1): Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
dat <- vroom(...)
problems(dat)Rows: 288 Columns: 7── Column specification ─────────────────────────────────────────────────────────
Delimiter: "\t"
chr (6): sp, matK, rbcL, ndhF, trnL-F, ITS
lgl (1): Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
dat <- vroom(...)
problems(dat)Rows: 288 Columns: 7── Column specification ─────────────────────────────────────────────────────────
Delimiter: "\t"
chr (6): sp, matK, rbcL, ndhF, trnL-F, ITS
lgl (1): Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 288 Columns: 7── Column specification ─────────────────────────────────────────────────────────
Delimiter: "\t"
chr (7): sp, matK, rbcL, ndhF, trnL-F, ITS, Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 285 Columns: 7── Column specification ─────────────────────────────────────────────────────────
Delimiter: "\t"
chr (7): sp, matK, rbcL, ndhF, trnL-F, ITS, Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 267 Columns: 7── Column specification ─────────────────────────────────────────────────────────
Delimiter: "\t"
chr (7): sp, matK, rbcL, ndhF, trnL-F, ITS, Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 200 Columns: 7── Column specification ─────────────────────────────────────────────────────────
Delimiter: "\t"
chr (7): sp, matK, rbcL, ndhF, trnL-F, ITS, Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 166 Columns: 7── Column specification ─────────────────────────────────────────────────────────
Delimiter: "\t"
chr (7): sp, matK, rbcL, ndhF, trnL-F, ITS, Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
results_barcodes
Now let’s summarise for each marker separately:
barcode_summary_family = split(results_barcodes,results_barcodes$marker) %>%
purrr::map_dfr(~summarize_results(.x,'family'),.id='marker')
barcode_summary_family
barcode_summary_genus = split(results_barcodes,results_barcodes$marker) %>%
purrr::map_dfr(~summarize_results(.x,'genus'),.id='marker')
barcode_summary_genus
barcode_summary_species = split(results_barcodes,results_barcodes$marker) %>%
purrr::map_dfr(~summarize_results(.x,'species'),.id='marker')
barcode_summary_species
Now let’s plot, making separate plots for each marker:
Species:
p_barcode_species = barcode_summary_species %>%
split(barcode_summary_species$marker) %>%
purrr::map(~plot_area(.x,paste0(unique(.x$marker),' species'), relative = TRUE, xlim_all = TRUE))
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
p_barcode_species
$ITS
$matK
$ndhF
$rbcL
$`trnL-F`
Genera:
p_barcode_genus = barcode_summary_genus %>%
split(barcode_summary_genus$marker) %>%
purrr::map(~plot_area(.x,paste0(unique(.x$marker),' genus'), relative = TRUE, xlim_all = TRUE))
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
p_barcode_genus
$ITS
$matK
$ndhF
$rbcL
$`trnL-F`
Family:
p_barcode_family = barcode_summary_family %>%
split(barcode_summary_family$marker) %>%
purrr::map(~plot_area(.x,paste0(unique(.x$marker),' family'), relative = TRUE,xlim_all = TRUE))
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
p_barcode_family
$ITS
$matK
$ndhF
$rbcL
$`trnL-F`
Now we will do the same for concatenated tree. Let’s start by defining a function to gather results. We will consider a result as correct if the majority of the sister taxon to a tip has the same label.
# Score identification accuracy from a concatenated-barcode phylogeny built
# with `bp` megabases of query data per sample. For each non-reference tip we
# find the nearest reference tip by patristic distance and compare taxonomic
# labels at family/genus/species level.
# Depends on the global data frame `samp_labels` (sample_id -> actual_labels).
# Returns a tibble with one row per query sample and *_correct/*_incorrect
# logical columns plus query_bp in base pairs.
read_concatenated_tree_results = function(bp){
# Read in your tree - replace 'your_tree_file.nwk' with the path to your tree file
tree = read.tree(paste0('Malpighiales/traditional_barcodes/2_blast_phylogeny_result/Genus/conc.',bp,'m.spname.tre'))
#leave only sample IDs as tip labels
tree$tip.label = tree$tip.label %>% str_remove(".*@") %>% str_remove("'") %>% str_replace(' ref','_ref')
# Compute the patristic distances and list all reference names
patristic_distances <- cophenetic(tree)
all_ref_names = dimnames(patristic_distances)[[1]][str_detect(dimnames(patristic_distances)[[1]],'_ref$')]
all_nonref = dimnames(patristic_distances)[[1]][str_detect(dimnames(patristic_distances)[[1]],'_ref$',negate = TRUE)]
# For each tip, find the reference sample with closest patristic distance
# (the tip's own reference is excluded; sort()[2] skips the tip itself,
# which sits at distance 0)
find_closest = function(tip){
to_keep = c(tip,all_ref_names[str_detect(all_ref_names,paste0(tip,'_ref'),negate = TRUE)])
return(names(sort(patristic_distances[tip,to_keep])[2]) %>%
str_remove('_ref'))
}
closest_match = purrr::map_chr(all_nonref,find_closest)
# Samples that actually had barcode data at this query size
samples_with_data = read_delim(paste0('Malpighiales/traditional_barcodes/2_blast_phylogeny_result/Genus/',bp,'M_blast_phylo_sum_sp.tsv')) %>%
select(sample_id=sp) %>%
mutate(sample_id = str_remove_all(sample_id, '@.+'))
barcode_res = tibble(sample_id = all_nonref,
closest_reference_sample_id = closest_match) %>%
right_join(samples_with_data) %>%
# look up labels for the query sample and for its closest reference
mutate(
predicted_labels = samp_labels$actual_labels[match(closest_reference_sample_id, samp_labels$sample_id)],
actual_labels = samp_labels$actual_labels[match(sample_id, samp_labels$sample_id)]
) %>%
# NOTE(review): sample '2095' is excluded here; the reason is not recorded
# in this chunk — confirm against the sample metadata.
filter(sample_id!='2095') %>%
mutate(
query_labels = str_remove(actual_labels, ";*low_quality:True;*") %>% str_split(';'),
predicted_list = str_split(predicted_labels, ';')
) %>%
rowwise() %>%
# A level is "correct" when the query's label at that level appears among
# the predicted labels, and "incorrect" when a predicted label at that
# level is absent from the query's labels. Species columns are NA when the
# query has no species-level label.
mutate(
family_correct = query_labels[str_detect(query_labels, 'family')] %in% predicted_list,
genus_correct = query_labels[str_detect(query_labels, 'genus')] %in% predicted_list,
species_correct = ifelse(any(str_detect(
query_labels, 'species'
)),
query_labels[str_detect(query_labels, 'species')] %in% predicted_list,
NA),
family_incorrect = any(!(predicted_list[str_detect(predicted_list, 'family')] %in% query_labels[str_detect(query_labels, 'family')])),
genus_incorrect = any(!(predicted_list[str_detect(predicted_list, 'genus')] %in% query_labels[str_detect(query_labels, 'genus')])),
species_incorrect = ifelse(any(str_detect(
query_labels, 'species'
)),
any(!(
predicted_list[str_detect(predicted_list, 'species')] %in% query_labels[str_detect(query_labels, 'species')]
)),
NA)
) %>%
# samples with no prediction are scored neither correct nor incorrect
mutate_at(vars(ends_with("_correct"), ends_with("_incorrect")),
~ ifelse(is.na(predicted_labels) & !is.na(.), FALSE, .)) %>%
# bp is given in megabases; convert to base pairs
mutate(query_bp = bp * 1e6)
return(barcode_res)
}
Now let’s apply this function
results_concat_barcodes = purrr::map_dfr(c(0.5,1,2,5,10,20,50,100,200),read_concatenated_tree_results)
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
dat <- vroom(...)
problems(dat)Rows: 288 Columns: 7── Column specification ─────────────────────────────────────────────────────────
Delimiter: "\t"
chr (6): sp, matK, rbcL, ndhF, trnL-F, ITS
lgl (1): Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Joining with `by = join_by(sample_id)`Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
dat <- vroom(...)
problems(dat)Rows: 288 Columns: 7── Column specification ─────────────────────────────────────────────────────────
Delimiter: "\t"
chr (6): sp, matK, rbcL, ndhF, trnL-F, ITS
lgl (1): Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Joining with `by = join_by(sample_id)`Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
dat <- vroom(...)
problems(dat)Rows: 288 Columns: 7── Column specification ─────────────────────────────────────────────────────────
Delimiter: "\t"
chr (6): sp, matK, rbcL, ndhF, trnL-F, ITS
lgl (1): Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Joining with `by = join_by(sample_id)`Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
dat <- vroom(...)
problems(dat)Rows: 288 Columns: 7── Column specification ─────────────────────────────────────────────────────────
Delimiter: "\t"
chr (6): sp, matK, rbcL, ndhF, trnL-F, ITS
lgl (1): Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Joining with `by = join_by(sample_id)`Rows: 288 Columns: 7── Column specification ─────────────────────────────────────────────────────────
Delimiter: "\t"
chr (7): sp, matK, rbcL, ndhF, trnL-F, ITS, Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Joining with `by = join_by(sample_id)`Rows: 285 Columns: 7── Column specification ─────────────────────────────────────────────────────────
Delimiter: "\t"
chr (7): sp, matK, rbcL, ndhF, trnL-F, ITS, Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Joining with `by = join_by(sample_id)`Rows: 267 Columns: 7── Column specification ─────────────────────────────────────────────────────────
Delimiter: "\t"
chr (7): sp, matK, rbcL, ndhF, trnL-F, ITS, Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Joining with `by = join_by(sample_id)`Rows: 200 Columns: 7── Column specification ─────────────────────────────────────────────────────────
Delimiter: "\t"
chr (7): sp, matK, rbcL, ndhF, trnL-F, ITS, Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Joining with `by = join_by(sample_id)`Rows: 166 Columns: 7── Column specification ─────────────────────────────────────────────────────────
Delimiter: "\t"
chr (7): sp, matK, rbcL, ndhF, trnL-F, ITS, Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Joining with `by = join_by(sample_id)`
results_concat_barcodes
Let’s summarize results and plot for genus, species and family accuracy
# Species-level summary and plot for the concatenated-barcode results.
concat_summary_species = summarize_results(results_concat_barcodes, level = 'species')
p_concat_species = plot_area(concat_summary_species,
                             title = 'Concatenated barcodes species',
                             relative = FALSE,
                             xlim_all = TRUE)
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
p_concat_species
# Genus-level summary and plot for the concatenated-barcode results.
concat_summary_genus = summarize_results(results_concat_barcodes, level = 'genus')
p_concat_genus = plot_area(concat_summary_genus,
                           title = 'Concatenated barcodes genus',
                           relative = TRUE,
                           xlim_all = TRUE)
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
p_concat_genus
# Family-level summary and plot for the concatenated-barcode results.
concat_summary_family = summarize_results(results_concat_barcodes, level = 'family')
p_concat_family = plot_area(concat_summary_family,
                            title = 'Concatenated barcodes family',
                            relative = TRUE,
                            xlim_all = TRUE)
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
p_concat_family
Now let’s compare methods side by side. For genus level:
# Stack the five genus-level panels; hide x axes on all but the bottom panel.
# (adding two theme() objects merges them, same as one combined call)
blank_x = theme(axis.text.x = element_blank(),
                axis.title.x = element_blank())
p = patchwork::wrap_plots(p_genus + blank_x,
                          p_skmer_genus + blank_x,
                          p_barcode_genus$ITS + blank_x,
                          p_barcode_genus$rbcL + blank_x,
                          p_concat_genus,
                          ncol = 1) +
  plot_annotation(title = 'Genus-level accuracy')
p
safe_ggsave ('images_manuscript/fig3_genus_accuracy.pdf', width=5,height = 10)
Warning: File images_manuscript/fig3_genus_accuracy.pdf already exists. File not overwritten.
safe_ggsave ('images_manuscript/fig3_genus_accuracy.png', width=5,height = 10,dpi=1200)
Warning: File images_manuscript/fig3_genus_accuracy.png already exists. File not overwritten.
Now for species level:
# Stack the five species-level panels; hide x axes on all but the bottom panel.
# Fix: capitalize the annotation title for consistency with the sibling
# 'Genus-level accuracy' and 'Family-level accuracy' figures.
blank_x = theme(axis.text.x = element_blank(),
                axis.title.x = element_blank())
p = patchwork::wrap_plots(p_species + blank_x,
                          p_skmer_species + blank_x,
                          p_barcode_species$ITS + blank_x,
                          p_barcode_species$rbcL + blank_x,
                          p_concat_species,
                          ncol = 1) +
  plot_annotation(title = 'Species-level accuracy')
p
safe_ggsave ('images_manuscript/fig3_species_accuracy.pdf', width=5,height = 10)
Warning: File images_manuscript/fig3_species_accuracy.pdf already exists. File not overwritten.
safe_ggsave ('images_manuscript/fig3_species_accuracy.png', width=5,height = 10,dpi=1200)
Warning: File images_manuscript/fig3_species_accuracy.png already exists. File not overwritten.
Now for family level:
# Stack the five family-level panels with per-panel titles; collect a single
# shared legend (the ITS panel keeps its legend to feed 'collect').
# (adding several theme() objects merges them, same as one combined call)
blank_x = theme(axis.text.x = element_blank(),
                axis.title.x = element_blank())
no_legend = theme(legend.position = 'none')
p = patchwork::wrap_plots(
  p_family + ggtitle('varKoder') + blank_x + no_legend,
  p_skmer_family + ggtitle('Skmer') + blank_x + no_legend,
  p_barcode_family$ITS + ggtitle('ITS') + blank_x,
  p_barcode_family$rbcL + ggtitle('rbcL') + blank_x + no_legend,
  p_concat_family + ggtitle('Concatenated conventional barcodes') + no_legend,
  ncol = 1, guides = 'collect'
) +
  plot_annotation(title = 'Family-level accuracy',
                  theme = theme(plot.title = element_text(hjust = 0.5)))
p
safe_ggsave ('images_manuscript/fig3_family_accuracy.pdf', width=5,height = 10)
Warning: File images_manuscript/fig3_family_accuracy.pdf already exists. File not overwritten.
safe_ggsave ('images_manuscript/fig3_family_accuracy.png', width=5,height = 10,dpi=1200)
Warning: File images_manuscript/fig3_family_accuracy.png already exists. File not overwritten.
Now let’s plot a figure for the other traditional barcode loci that did not make this list.
# Column of species-level plots for the remaining markers. An empty ggplot at
# the top acts as a slim column header ("Species") via the 1:15 height ratio.
p1 = patchwork::wrap_plots(ggplot() +
theme_minimal() +
ggtitle("Species") +
theme(plot.title = element_text(hjust=0.5),
plot.margin = unit(c(0,0,0,0),'lines')),
p_barcode_species$matK +
ggtitle('matK') +
theme(axis.text.x = element_blank(),
axis.title.x = element_blank(),
axis.title.y =element_blank(),
legend.position = 'none'),
p_barcode_species$ndhF +
ggtitle('ndhF') +
theme(axis.text.x = element_blank(),
axis.title.x = element_blank(),
axis.title.y =element_blank(),
legend.position = 'none'),
# bottom panel keeps its x-axis tick labels
p_barcode_species$`trnL-F` +
ggtitle('trnL-F') +
theme(axis.title.y =element_blank(),
legend.position = 'none',
axis.title.x =element_blank()),
ncol = 1,
heights = c(1,15,15,15)
)
# Column of genus-level plots for the remaining markers; same layout as p1,
# but y-axis text is also suppressed since p1 provides the shared y labels.
p2 = patchwork::wrap_plots(ggplot() +
theme_minimal() +
ggtitle("Genus") +
theme(plot.title = element_text(hjust=0.5),
plot.margin = unit(c(0,0,0,0),'lines')),
p_barcode_genus$matK +
ggtitle('matK') +
theme(axis.text.x = element_blank(),
axis.title.x = element_blank(),
axis.title.y =element_blank(),
axis.text.y =element_blank(),
legend.position = 'none'),
p_barcode_genus$ndhF +
ggtitle('ndhF') +
theme(axis.text.x = element_blank(),
axis.title.x = element_blank(),
axis.title.y =element_blank(),
axis.text.y =element_blank(),
legend.position = 'none'),
# bottom panel keeps its x-axis tick labels
p_barcode_genus$`trnL-F` +
ggtitle('trnL-F') +
theme(axis.title.y =element_blank(),
axis.text.y =element_blank(),
legend.position = 'none',
axis.title.x =element_blank()),
ncol = 1,
heights = c(1,15,15,15))
# Column of family-level plots for the remaining markers. The trnL-F panel
# keeps its legend so guides='collect' can harvest it for the whole figure.
p3 = patchwork::wrap_plots(ggplot() +
theme_minimal() +
ggtitle("Family") +
theme(plot.title = element_text(hjust=0.5),
plot.margin = unit(c(0,0,0,0),'lines')),
p_barcode_family$matK +
ggtitle('matK') +
theme(axis.text.x = element_blank(),
axis.title.x = element_blank(),
axis.title.y =element_blank(),
axis.text.y =element_blank(),
legend.position = 'none'),
p_barcode_family$ndhF +
ggtitle('ndhF') +
theme(axis.text.x = element_blank(),
axis.title.x = element_blank(),
axis.title.y =element_blank(),
axis.text.y =element_blank(),
legend.position = 'none'),
# bottom panel keeps its x-axis tick labels and its legend
p_barcode_family$`trnL-F` +
ggtitle('trnL-F') +
theme(axis.title.y =element_blank(),
axis.text.y =element_blank(),
axis.title.x =element_blank()),
ncol = 1,
heights = c(1,15,15,15),
guides='collect')
# Assemble the three columns side by side with a single collected legend.
p = patchwork::wrap_plots(p1,p2,p3,ncol=3,guides="collect") +
plot_annotation(
title = 'Conventional barcode accuracy across different taxonomic levels',
theme = theme(plot.title = element_text(hjust = 0.2,face='bold',size=15))
)
safe_ggsave ('images_manuscript/supp_traditional_barcodes.pdf', plot = p, width=8,height = 6)
Warning: File images_manuscript/supp_traditional_barcodes.pdf already exists. File not overwritten.
Finally, let’s summarize results for the whole SRA dataset. In this case, we only have varKoder since Skmer cannot finish and traditional barcodes are inapplicable. We have done predictions separately for families included in the training set and families not included in the training set, so we will load each one and concatenate
# Load SRA fCGR predictions for families included in the training set.
# Fix: use TRUE instead of T (T is an ordinary, reassignable variable;
# TRUE is a reserved word).
varKoder_SRA_results = read_csv('all_SRA_eukaryote_families/vkfCGR_query_results/predictions.csv') %>%
  select(-1) %>%  # drop the unnamed row-index column
  mutate(query_basepairs = case_when(
    # NOTE(review): is.character() tests the whole column (length-1 result,
    # recycled by case_when); handles files where sizes are stored as
    # strings like "500K" — confirm this matches the upstream formats.
    is.character(query_basepairs) ~ as.numeric(str_remove(query_basepairs, "K")) * 1000,
    TRUE ~ as.numeric(query_basepairs)
  )) %>%
  filter(str_detect(format(query_basepairs, scientific = FALSE,trim = TRUE), "^[125]0+$")) %>% #we will ignore queries that are not standardized sizes
  rename(query_bp = query_basepairs) %>%
  mutate(quality_included = TRUE)
Rows: 8607 Columns: 12── Column specification ─────────────────────────────────────────────────────────
Delimiter: ","
chr (6): varKode_image_path, sample_id, query_mapping, trained_model_path, pr...
dbl (5): query_basepairs, query_kmer_len, actual_labels, basefrequency_sd, pr...
lgl (1): possible_low_quality
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
plan(sequential)
# Score each SRA prediction at the family level.
varKoder_SRA_results = varKoder_SRA_results %>%
# NOTE(review): unlist() flattens the per-row split labels into one vector;
# this is only equivalent to keeping a list-column when every sample has
# exactly one label after removing the low_quality flag — TODO confirm.
mutate(query_labels = str_remove(actual_labels,";*low_quality:True;*") %>% str_split(';') %>% unlist,
predicted_list = str_split(predicted_labels,';')
) %>%
rowwise() %>%
# correct: the true label is among predictions; incorrect: some prediction
# is not among the true labels (FALSE when there was no prediction at all)
mutate(family_correct = query_labels %in% predicted_list,
family_incorrect = ifelse(is.na(predicted_labels),FALSE,any(!(predicted_list %in% query_labels))),
family_in_training = TRUE) %>%
# keep only columns whose names do not start with a digit (drops the
# per-taxon probability columns — presumably named by taxon ID; confirm)
select(matches("^[^0-9]"))
varKoder_SRA_results
NA
# Load SRA fCGR predictions for families NOT included in the training set.
# Fix: use TRUE instead of T (T is an ordinary, reassignable variable;
# TRUE is a reserved word).
varKoder_SRA_results_notincluded = read_csv('all_SRA_eukaryote_families/vkfCGR_query_notincluded_results/predictions.csv') %>%
  select(-1) %>%  # drop the unnamed row-index column
  mutate(query_basepairs = case_when(
    # NOTE(review): is.character() tests the whole column (length-1 result,
    # recycled by case_when) — confirm this matches the upstream formats.
    is.character(query_basepairs) ~ as.numeric(str_remove(query_basepairs, "K")) * 1000,
    TRUE ~ as.numeric(query_basepairs)
  )) %>%
  filter(str_detect(format(query_basepairs, scientific = FALSE,trim = TRUE), "^[125]0+$")) %>% #we will ignore queries that are not standardized sizes
  rename(query_bp = query_basepairs) %>%
  mutate(quality_included = TRUE)
Rows: 8439 Columns: 12── Column specification ─────────────────────────────────────────────────────────
Delimiter: ","
chr (6): varKode_image_path, sample_id, query_mapping, trained_model_path, pr...
dbl (5): query_basepairs, query_kmer_len, actual_labels, basefrequency_sd, pr...
lgl (1): possible_low_quality
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
plan(sequential)
SRA_taxlabels_notincluded = str_remove(varKoder_SRA_results_notincluded$actual_labels,";*low_quality:True;*") %>% str_split(';') %>% unlist %>% unique
# Score the not-included predictions at the family level (mirrors the
# scoring of the included set above, with family_in_training = FALSE).
varKoder_SRA_results_notincluded = varKoder_SRA_results_notincluded %>%
# NOTE(review): unlist() flattens the per-row split labels into one vector;
# only equivalent to a list-column when each sample has one label — confirm.
mutate(query_labels = str_remove(actual_labels,";*low_quality:True;*") %>% str_split(';') %>% unlist,
predicted_list = str_split(predicted_labels,';')
) %>%
rowwise() %>%
mutate(family_correct = query_labels %in% predicted_list,
family_incorrect = ifelse(is.na(predicted_labels),FALSE,any(!(predicted_list %in% query_labels))),
family_in_training = FALSE) %>%
# keep only columns whose names do not start with a digit
select(matches("^[^0-9]"))
varKoder_SRA_results_notincluded
Now let’s summarize and plot:
# Combine family-level summaries for included and not-included families.
sra_summary_included = summarize_results(varKoder_SRA_results, 'family') %>%
  mutate(family_in_training = TRUE)
sra_summary_notincluded = summarize_results(varKoder_SRA_results_notincluded, 'family') %>%
  mutate(family_in_training = FALSE)
SRA_summary_family = bind_rows(sra_summary_included, sra_summary_notincluded)
SRA_summary_family
# Total sample count per query size and training-set membership.
# summarise() leaves the result grouped by query_bp (hence the console note).
N_samp = SRA_summary_family %>%
group_by(query_bp, family_in_training) %>%
summarise(N = sum(N))
`summarise()` has grouped output by 'query_bp'. You can override using the `.groups` argument.
p_SRA_family = plot_area(SRA_summary_family, 'varKoder SRA family', relative = TRUE,xlim_all = FALSE, wrap = '~family_in_training')
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
p_SRA_family
Let’s now do the SRA plot, but splitting by kingdom and whether or not family was included in training. First, we need to retrieve kingdom information:
# Summarize SRA family-level accuracy per kingdom, separately for families
# that were in the training set and families that were not, then combine.
# Fix: use TRUE/FALSE instead of T/F (T and F are ordinary, reassignable
# variables; TRUE/FALSE are reserved words).
summary_SRA_by_kingdom = read_csv('all_SRA_eukaryote_families/runs_to_download_data.csv') %>%
  select(sample_id = Run, Kingdom) %>%
  right_join(varKoder_SRA_results) %>%
  split(.$Kingdom) %>%
  purrr::map_df(summarize_results,
                level='family',
                .id='Kingdom'
  ) %>%
  mutate(Kingdom = factor(Kingdom,levels=c('Metazoa','Viridiplantae','Fungi'),ordered = TRUE),
         family_in_training = TRUE) %>%
  bind_rows(read_csv('all_SRA_eukaryote_families/runs_notincluded_to_download_data.csv') %>%
              select(sample_id = Run, Kingdom) %>%
              right_join(varKoder_SRA_results_notincluded) %>%
              split(.$Kingdom) %>%
              purrr::map_df(summarize_results,
                            level='family',
                            .id='Kingdom'
              ) %>%
              mutate(Kingdom = factor(Kingdom,levels=c('Metazoa','Viridiplantae','Fungi'),ordered = TRUE),
                     family_in_training = FALSE)) %>%
  # recode the logical flag as facet-ready labels (FALSE -> first element,
  # TRUE -> second element)
  mutate(family_in_training = c('Family\nnot in training set', 'Family\nin training set')[family_in_training+1])
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
dat <- vroom(...)
problems(dat)Rows: 8264 Columns: 51── Column specification ─────────────────────────────────────────────────────────
Delimiter: ","
chr (28): Run, AssemblyName, download_path, Experiment, LibraryName, Library...
dbl (11): spots, bases, spots_with_mates, avgLength, size_MB, InsertSize, In...
lgl (10): g1k_pop_code, source, g1k_analysis_group, Subject_ID, Disease, Aff...
dttm (2): ReleaseDate, LoadDate
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Joining with `by = join_by(sample_id)`Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
dat <- vroom(...)
problems(dat)Rows: 1697 Columns: 51── Column specification ─────────────────────────────────────────────────────────
Delimiter: ","
chr (27): Run, download_path, Experiment, LibraryName, LibraryStrategy, Libr...
dbl (11): spots, bases, spots_with_mates, avgLength, size_MB, InsertSize, In...
lgl (11): AssemblyName, g1k_pop_code, source, g1k_analysis_group, Subject_ID...
dttm (2): ReleaseDate, LoadDate
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Joining with `by = join_by(sample_id)`
summary_SRA_by_kingdom
# SRA accuracy plot faceted by training-set membership x kingdom.
# coord_cartesian() intentionally replaces the coordinate system that
# plot_area() already sets (hence the console message about replacement).
p_SRA_families = plot_area(summary_SRA_by_kingdom ,
relative=FALSE,
xlim_all = FALSE,
title='Eukaryote families') +
facet_grid(family_in_training~Kingdom) +
coord_cartesian(xlim=c(500,10000)*1000,expand = FALSE) +
theme(text = element_text(size=10))
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Coordinate system already present. Adding new coordinate system, which will
replace the existing one.
print(p_SRA_families)
safe_ggsave ('images_manuscript/fig3_SRA_accuracy.pdf', width=5,height = 4)
Warning: File images_manuscript/fig3_SRA_accuracy.pdf already exists. File not overwritten.
safe_ggsave ('images_manuscript/fig3_SRA_accuracy.png', width=5,height = 4,dpi = 1200)
Warning: File images_manuscript/fig3_SRA_accuracy.png already exists. File not overwritten.
We repeated this analysis with varKodes. Let’s compare now.
plan(sequential)
# Read varKode predictions, forcing query_basepairs to character so sizes
# like "500K" parse consistently; keep only standardized sizes (1, 2 or 5
# followed by zeros, in K units) and convert to base pairs.
varKoder_SRA_results_vk = read_csv('all_SRA_eukaryote_families/varkoder_query_results/predictions.csv',
                                   col_types = list(query_basepairs = "c")) %>%
  select(-1) %>%
  filter(str_detect(query_basepairs, '^0*[125]0+K$')) %>%
  mutate(query_bp = as.numeric(str_remove(query_basepairs, 'K')) * 1000)
New names:
# Score each varKode prediction at the family level (mirrors fCGR scoring).
varKoder_SRA_results_vk = varKoder_SRA_results_vk %>%
# NOTE(review): unlist() flattens the per-row split labels into one vector;
# only equivalent to a list-column when each sample has one label — confirm.
mutate(query_labels = str_remove(actual_labels,";*low_quality:True;*") %>% str_split(';') %>% unlist,
predicted_list = str_split(predicted_labels,';')
) %>%
rowwise() %>%
mutate(family_correct = query_labels %in% predicted_list,
family_incorrect = ifelse(is.na(predicted_labels),FALSE,any(!(predicted_list %in% query_labels))),
family_in_training = TRUE) %>%
# keep only columns whose names do not start with a digit
select(matches("^[^0-9]"))
# Read varKode predictions for families NOT in the training set; same
# filtering of standardized query sizes as for the included set.
varKoder_SRA_results_notincluded_vk = read_csv('all_SRA_eukaryote_families/varkoder_query_notincluded_results/predictions.csv',
                                               col_types = list(query_basepairs = "c")) %>%
  select(-1) %>%
  filter(str_detect(query_basepairs, '^0*[125]0+K$')) %>%
  mutate(query_bp = as.numeric(str_remove(query_basepairs, 'K')) * 1000)
New names:
# Unique taxonomic labels present in the not-included varKode query set.
# Fix: this previously read from varKoder_SRA_results_vk (the in-training
# results) despite the "_notincluded" name; the parallel fCGR code above
# derives the same quantity from the not-included table.
SRA_taxlabels_notincluded_vk = str_remove(varKoder_SRA_results_notincluded_vk$actual_labels,";*low_quality:True;*") %>%
  str_split(';') %>%
  unlist %>%
  unique
# Score the not-included varKode predictions at the family level.
varKoder_SRA_results_notincluded_vk = varKoder_SRA_results_notincluded_vk %>%
# NOTE(review): unlist() flattens the per-row split labels into one vector;
# only equivalent to a list-column when each sample has one label — confirm.
mutate(query_labels = str_remove(actual_labels,";*low_quality:True;*") %>% str_split(';') %>% unlist,
predicted_list = str_split(predicted_labels,';')
) %>%
rowwise() %>%
mutate(family_correct = query_labels %in% predicted_list,
family_incorrect = ifelse(is.na(predicted_labels),FALSE,any(!(predicted_list %in% query_labels))),
family_in_training = FALSE) %>%
# keep only columns whose names do not start with a digit
select(matches("^[^0-9]"))
# Summary statistics
# Combine varKode family-level summaries for included/not-included families.
sra_vk_included = summarize_results(varKoder_SRA_results_vk, 'family') %>%
  mutate(family_in_training = TRUE)
sra_vk_notincluded = summarize_results(varKoder_SRA_results_notincluded_vk, 'family') %>%
  mutate(family_in_training = FALSE)
SRA_summary_family_vk = bind_rows(sra_vk_included, sra_vk_notincluded)
# Total sample count per query size and training-set membership.
# summarise() leaves the result grouped by query_bp (hence the console note).
N_samp_vk = SRA_summary_family_vk %>%
group_by(query_bp, family_in_training) %>%
summarise(N = sum(N))
`summarise()` has grouped output by 'query_bp'. You can override using the `.groups` argument.
# Kingdom summary
# Summarize varKode SRA family-level accuracy per kingdom, separately for
# families in and not in the training set, then combine (mirrors the fCGR
# summary above).
summary_SRA_by_kingdom_vk = read_csv('all_SRA_eukaryote_families/runs_to_download_data.csv') %>%
select(sample_id = Run, Kingdom) %>%
right_join(varKoder_SRA_results_vk) %>%
split(.$Kingdom) %>%
purrr::map_df(summarize_results,
level='family',
.id='Kingdom') %>%
mutate(Kingdom = factor(Kingdom,levels=c('Metazoa','Viridiplantae','Fungi'),ordered = TRUE),
family_in_training = TRUE) %>%
bind_rows(
read_csv('all_SRA_eukaryote_families/runs_notincluded_to_download_data.csv') %>%
select(sample_id = Run, Kingdom) %>%
right_join(varKoder_SRA_results_notincluded_vk) %>%
split(.$Kingdom) %>%
purrr::map_df(summarize_results,
level='family',
.id='Kingdom') %>%
mutate(Kingdom = factor(Kingdom,levels=c('Metazoa','Viridiplantae','Fungi'),ordered = TRUE),
family_in_training = FALSE)
) %>%
# recode the logical flag as facet-ready labels (FALSE -> first, TRUE -> second)
mutate(family_in_training = c('Family\nnot in training set', 'Family\nin training set')[family_in_training+1])
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
dat <- vroom(...)
problems(dat)Rows: 8264 Columns: 51── Column specification ─────────────────────────────────────────────────────────
Delimiter: ","
chr (28): Run, AssemblyName, download_path, Experiment, LibraryName, Library...
dbl (11): spots, bases, spots_with_mates, avgLength, size_MB, InsertSize, In...
lgl (10): g1k_pop_code, source, g1k_analysis_group, Subject_ID, Disease, Aff...
dttm (2): ReleaseDate, LoadDate
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Joining with `by = join_by(sample_id)`Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
dat <- vroom(...)
problems(dat)Rows: 1697 Columns: 51── Column specification ─────────────────────────────────────────────────────────
Delimiter: ","
chr (27): Run, download_path, Experiment, LibraryName, LibraryStrategy, Libr...
dbl (11): spots, bases, spots_with_mates, avgLength, size_MB, InsertSize, In...
lgl (11): AssemblyName, g1k_pop_code, source, g1k_analysis_group, Subject_ID...
dttm (2): ReleaseDate, LoadDate
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Joining with `by = join_by(sample_id)`
# Final plot
# varKode SRA accuracy plot faceted by training-set membership x kingdom.
# coord_cartesian() intentionally replaces the coordinate system that
# plot_area() already sets (hence the console message about replacement).
p_SRA_families_vk = plot_area(summary_SRA_by_kingdom_vk,
relative=FALSE,
xlim_all = FALSE,
title='Eukaryote families') +
facet_grid(family_in_training~Kingdom) +
coord_cartesian(xlim=c(500,10000)*1000,expand = FALSE) +
theme(text = element_text(size=10))
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Coordinate system already present. Adding new coordinate system, which will
replace the existing one.
p_SRA_families_vk
The graph looks almost identical, let’s make a table comparing side-by-side. It seems rfCGRs are slightly better.
# Side-by-side comparison of varKode vs rfCGR accuracy per kingdom/size.
# Fix: removed a stray empty argument in the second select()
# ("select(summary_SRA_by_kingdom,,Kingdom,...)").
full_join(
  select(summary_SRA_by_kingdom_vk, Kingdom, query_bp, result, p, family_in_training),
  select(summary_SRA_by_kingdom, Kingdom, query_bp, result, p, family_in_training),
  by = join_by(Kingdom, query_bp, result, family_in_training),
  suffix = c('.varKode', '.rfCGR')
) %>%
  # positive difference = varKode better; negative = rfCGR better
  mutate(difference = p.varKode - `p.rfCGR`) %>%
  arrange(family_in_training, result, query_bp, Kingdom)
Now we will make a small figure to include the additional datasets in which we applied varKoding.
In these cases, we chose a test set that included both taxa in the
training set and taxa not in the training set, so we will graph both
separately. This is denoted by a column named
in_training_model. Let’s start by reading results.
Let’s define a function to read and process predictions:
# Read one varKoder predictions csv for an additional dataset and score each
# sample: taxon_correct if any true label is among the predictions,
# taxon_incorrect if any (non-NA) prediction is not among the true labels.
# Fix: removed the dead local `all_taxlabels`, which was computed but never
# used anywhere in the function.
read_and_process_others = function(infile){
varkoder_results = read_csv(infile) %>%
mutate(sample_id = as.character(sample_id)) %>%
select(-1) %>%
rename(query_bp = query_basepairs)
varkoder_results = varkoder_results %>%
mutate(query_labels = str_remove(actual_labels,";*low_quality:True;*") %>% str_split(';'),
predicted_list = str_split(predicted_labels,';')
) %>%
rowwise() %>%
mutate(taxon_correct = any(query_labels %in% predicted_list),
taxon_incorrect = any(!(predicted_list[!is.na(predicted_list)] %in% query_labels))
)
return(varkoder_results)
}
Now let’s apply this function to all files.
# Locate all prediction csv files and name them by dataset prefix.
# Fix: use TRUE instead of T (T is an ordinary, reassignable variable;
# TRUE is a reserved word).
prediction_files = list.files('other_datasets/varKode/', pattern = 'predictions.+csv', full.names = TRUE, recursive = TRUE)
names(prediction_files) = basename(prediction_files) %>% str_extract(".*(?=_predictions\\.csv)")
other_results = purrr::map_dfr(prediction_files, read_and_process_others, .id='dataset')
Rows: 18 Columns: 16── Column specification ─────────────────────────────────────────────────────────
Delimiter: ","
chr (7): sample_id, query_basepairs, query_kmer_len, prediction_type, in_trai...
dbl (8): Bembidion, a, basefrequency_sd, ampliatum, breve, lividulum, saturat...
lgl (1): possible_low_quality
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 18 Columns: 16── Column specification ─────────────────────────────────────────────────────────
Delimiter: ","
chr (7): sample_id, query_basepairs, query_kmer_len, prediction_type, in_trai...
dbl (8): Corallorhiza, prediction_threshold, basefrequency_sd, Corallorhiza b...
lgl (1): possible_low_quality
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 25 Columns: 16── Column specification ─────────────────────────────────────────────────────────
Delimiter: ","
chr (7): sample_id, query_basepairs, query_kmer_len, prediction_type, in_trai...
dbl (8): Mycobacterium tuberculosis, prediction_threshold, basefrequency_sd, ...
lgl (1): possible_low_quality
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 15 Columns: 16── Column specification ─────────────────────────────────────────────────────────
Delimiter: ","
chr (7): sample_id, query_basepairs, query_kmer_len, prediction_type, in_trai...
dbl (8): Xanthoparmelia, prediction_threshold, basefrequency_sd, camtschadali...
lgl (1): possible_low_quality
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
other_results
Let’s now summarize by dataset and separately for taxa included and excluded from the training set.
# Summarize per dataset x in_training_model combination.
# Fix: use TRUE instead of T in factor(..., ordered=).
summary_others = other_results %>%
  # NOTE(review): interaction() joins the two keys with '.', and separate()
  # later splits on '.', so a dataset name containing a dot would break
  # this round-trip — confirm dataset names are dot-free.
  split(interaction(other_results$dataset, other_results$in_training_model)) %>%
  purrr::map_dfr(summarize_results, level = 'taxon', .id = 'comb') %>%
  separate(comb, into = c("dataset", "taxon_in_training_raw"), sep = "\\.") %>%
  mutate(taxon_in_training = taxon_in_training_raw == 'yes') %>%
  select(-taxon_in_training_raw) %>%
  # recode logical as display labels (FALSE -> first, TRUE -> second) and
  # capitalize the first letter of the dataset name
  mutate(taxon_in_training = c('Taxon not in training set', 'Taxon in training set')[taxon_in_training+1],
         dataset = str_replace(dataset, "^(.)", ~toupper(.x))) %>%
  mutate(result = factor(result,
                         levels=c("correct", "ambiguous", "inconclusive", "incorrect"),
                         ordered=TRUE))
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `query_bp = as.numeric(as.character(query_bp))`.
Caused by warning:
! NAs introduced by coercionWarning: There was 1 warning in `mutate()`.
ℹ In argument: `query_bp = as.numeric(as.character(query_bp))`.
Caused by warning:
! NAs introduced by coercionWarning: There was 1 warning in `mutate()`.
ℹ In argument: `query_bp = as.numeric(as.character(query_bp))`.
Caused by warning:
! NAs introduced by coercionWarning: There was 1 warning in `mutate()`.
ℹ In argument: `query_bp = as.numeric(as.character(query_bp))`.
Caused by warning:
! NAs introduced by coercionWarning: There was 1 warning in `mutate()`.
ℹ In argument: `query_bp = as.numeric(as.character(query_bp))`.
Caused by warning:
! NAs introduced by coercionWarning: There was 1 warning in `mutate()`.
ℹ In argument: `query_bp = as.numeric(as.character(query_bp))`.
Caused by warning:
! NAs introduced by coercionWarning: There was 1 warning in `mutate()`.
ℹ In argument: `query_bp = as.numeric(as.character(query_bp))`.
Caused by warning:
! NAs introduced by coercionWarning: There was 1 warning in `mutate()`.
ℹ In argument: `query_bp = as.numeric(as.character(query_bp))`.
Caused by warning:
! NAs introduced by coercion
summary_others
NA
Now let’s plot
# Stacked counts of prediction outcomes per dataset, faceted by whether the
# taxon was in the training set. The two theme() calls of the original are
# consolidated into one; ggplot2 merges theme elements property-wise, so the
# combined call is equivalent (italic + angle + hjust all applied).
p_others = ggplot(summary_others, aes(x = dataset, y = N, fill = result)) +
  geom_col() +
  scale_fill_manual(values = setNames(RColorBrewer::brewer.pal(4, "Accent"), c("correct", "ambiguous", "inconclusive", "incorrect"))) +
  # NOTE(review): no alpha aesthetic is mapped, so this scale appears inert;
  # kept for parity with related plots — confirm before removing.
  scale_alpha_manual(values = c(0.5, 1)) +
  ggtitle('Other datasets') +
  ylab('Number of samples') +
  xlab('Taxon') +
  theme_few() +
  scale_y_continuous(minor_breaks = waiver()) +
  theme(panel.background = element_rect(fill = NA),
        panel.grid.major.y = element_line(colour = gray(0.5)),
        panel.grid.minor.y = element_line(colour = gray(0.6), linetype = 2),
        axis.title.x = element_blank(),
        axis.text.x = element_text(face = 'italic', angle = 30, hjust = 1),
        panel.ontop = TRUE,
        text = element_text(size = 10)) +
  coord_cartesian(expand = FALSE) +
  facet_grid(taxon_in_training ~ .)
p_others
p_others
Let’s repeat everything for CGR representation now
# Read prediction files
prediction_files_CGR = list.files('other_datasets/cgr/',pattern = 'predictions.+csv',full.names = TRUE,recursive = T)
names(prediction_files_CGR) = basename(prediction_files_CGR) %>% str_extract(".*(?=_predictions\\.csv)")
other_results_CGR = purrr::map_dfr(prediction_files_CGR, read_and_process_others, .id='dataset')
Rows: 18 Columns: 18── Column specification ─────────────────────────────────────────────────────────
Delimiter: ","
chr (8): varKode_image_path, sample_id, in_training_model, query_mapping, tra...
dbl (9): query_basepairs, query_kmer_len, basefrequency_sd, prediction_thresh...
lgl (1): possible_low_quality
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 18 Columns: 18── Column specification ─────────────────────────────────────────────────────────
Delimiter: ","
chr (8): varKode_image_path, sample_id, in_training_model, query_mapping, tra...
dbl (9): query_basepairs, query_kmer_len, basefrequency_sd, prediction_thresh...
lgl (1): possible_low_quality
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 25 Columns: 18── Column specification ─────────────────────────────────────────────────────────
Delimiter: ","
chr (8): varKode_image_path, sample_id, in_training_model, query_mapping, tra...
dbl (9): query_basepairs, query_kmer_len, basefrequency_sd, prediction_thresh...
lgl (1): possible_low_quality
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 15 Columns: 18── Column specification ─────────────────────────────────────────────────────────
Delimiter: ","
chr (8): varKode_image_path, sample_id, in_training_model, query_mapping, tra...
dbl (9): query_basepairs, query_kmer_len, basefrequency_sd, prediction_thresh...
lgl (1): possible_low_quality
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Summarize CGR prediction results per (dataset x in-training) combination.
# NOTE(review): summarize_results() is defined earlier in this file, outside
# this chunk; assumed to return one row per result category — confirm there.
summary_others_CGR = other_results_CGR %>%
split(interaction(other_results_CGR$dataset, other_results_CGR$in_training_model)) %>%
purrr::map_dfr(summarize_results, level = 'taxon', .id = 'comb') %>%
# .id packed "dataset.in_training" into a single string; split it back out.
separate(comb, into = c("dataset", "taxon_in_training_raw"), sep = "\\.") %>%
mutate(taxon_in_training = taxon_in_training_raw == 'yes') %>%
select(-taxon_in_training_raw) %>%
# FALSE/TRUE + 1 indexes into the two display labels; also capitalize dataset.
mutate(taxon_in_training = c('Taxon not in training set', 'Taxon in training set')[taxon_in_training+1],
dataset = str_replace(dataset, "^(.)", ~toupper(.x))) %>%
# Ordered factor so stacking order and legend follow outcome severity.
mutate(result = factor(result,
levels=c("correct", "ambiguous", "inconclusive", "incorrect"),
ordered=TRUE))
# Same outcome-by-dataset stacked bar chart as p_others above, but for the
# CGR representation.
p_others_CGR = ggplot(summary_others_CGR, aes(x = dataset, y = N, fill = result)) +
geom_col() +
scale_fill_manual(values = setNames(RColorBrewer::brewer.pal(4, "Accent"),
c("correct", "ambiguous", "inconclusive", "incorrect"))) +
scale_alpha_manual(values=c(0.5,1)) +
ggtitle('Other datasets') +
ylab('Number of samples') +
xlab('Taxon') +
theme_few() +
scale_y_continuous(minor_breaks = waiver()) +
# Gridlines drawn over the bars (panel.ontop) so they remain visible.
theme(panel.background = element_rect(fill = NA),
panel.grid.major.y = element_line(colour = gray(0.5)),
panel.grid.minor.y = element_line(colour = gray(0.6),linetype = 2),
axis.title.x = element_blank(),
axis.text.x = element_text(face='italic'),
panel.ontop = TRUE) +
coord_cartesian(expand=FALSE) +
facet_grid(taxon_in_training~.) +
theme(text = element_text(size=10),
axis.text.x = element_text(angle = 30,hjust = 1))
# Display the plot.
p_others_CGR
safe_ggsave ('images_manuscript/fig3_others_accuracy.pdf', width=3.5,height = 3)
Warning: File images_manuscript/fig3_others_accuracy.pdf already exists. File not overwritten.
safe_ggsave ('images_manuscript/fig3_others_accuracy.png', width=3.5,height = 3,dpi = 1200)
Warning: File images_manuscript/fig3_others_accuracy.png already exists. File not overwritten.
This function takes a string of semicolon-separated key-value pairs and converts it into a structured dataframe. It splits the input string on semicolons, then further splits each pair on colons to separate keys and values. The result is a two-column dataframe with ‘key’ and ‘value’ columns, providing an organized representation of the label data.
# Convert a "key1:value1;key2:value2" label string into a two-column
# data frame with character columns `key` and `value`.
#
# x: a single semicolon-separated string of colon-delimited pairs.
# Returns: data.frame(key, value), one row per pair. A pair with no ':'
#          yields NA in `value`.
unpack_labels = function(x){
  # Split the input string into individual "key:value" pairs.
  kvs = strsplit(x, ';')[[1]]
  # Split each pair into its key and value components.
  kvs_split = strsplit(kvs, ':')
  # vapply() (instead of sapply) guarantees a character vector even when
  # there are zero pairs, so the data.frame is well-formed for empty input.
  data.frame(
    key = vapply(kvs_split, `[`, character(1), 1),
    value = vapply(kvs_split, `[`, character(1), 2),
    stringsAsFactors = FALSE
  )
}
This function compares two sets of labels (actual and predicted) and generates a summary of their agreement. It processes all unique keys found in both sets, excluding certain taxonomy-related keys, and includes a special ‘Taxonomy_all’ key. For each key, it calculates the number of actual and predicted values, true positives (TP), false negatives (FN), and false positives (FP). The output is a dataframe summarizing these metrics for each key, enabling detailed analysis of prediction accuracy across different label types.
# Compare actual vs predicted label sets and score agreement per key.
#
# actual, predicted: data frames with `key` and `value` columns (as produced
#   by unpack_labels()).
# Returns: a data frame with one row per key giving the number of actual and
#   predicted values plus TP / FN / FP counts. Keys "Taxonomy_no_rank" and
#   "Taxonomy_clade" are excluded; a synthetic "Taxonomy_all" key pools every
#   Taxonomy_* rank.
summarize_comparison <- function(actual, predicted) {
  # Work on plain character keys so factor levels cannot interfere.
  actual$key <- as.character(actual$key)
  predicted$key <- as.character(predicted$key)

  # Union of keys from both sides, minus ranks we do not score,
  # plus the aggregate "Taxonomy_all" key (appended if absent).
  keys <- setdiff(unique(c(actual$key, predicted$key)),
                  c("Taxonomy_no_rank", "Taxonomy_clade"))
  if (!("Taxonomy_all" %in% keys)) {
    keys <- c(keys, "Taxonomy_all")
  }

  # Score a single key: value counts plus TP/FN/FP.
  score_key <- function(k) {
    if (k == "Taxonomy_all") {
      # Pool values across every taxonomic rank.
      av <- actual$value[grepl("^Taxonomy_", actual$key)]
      pv <- predicted$value[grepl("^Taxonomy_", predicted$key)]
    } else {
      av <- actual$value[actual$key == k]
      pv <- predicted$value[predicted$key == k]
    }
    data.frame(
      key = k,
      N_actual = length(av),
      N_predicted = length(pv),
      TP = sum(av %in% pv),
      FN = sum(!av %in% pv),
      FP = sum(!pv %in% av),
      stringsAsFactors = FALSE
    )
  }

  # One row per key, in the same order the keys were collected.
  do.call(rbind, lapply(keys, score_key))
}
This function performs parallel processing on a dataframe to extract and structure label information. It uses multiple cores to speed up processing of large datasets. For each row, it unpacks the actual and predicted labels, extracts library strategy and platform information, and structures this data into a new dataframe. The result is a processed dataframe with unpacked label information and additional metadata, optimized for further analysis.
# Unpack the actual/predicted label strings of each prediction row in
# parallel, carrying along per-sample metadata (library strategy, platform).
#
# df: data frame with columns sample_id, query_basepairs, query_mapping,
#     actual_labels and predicted_labels ("key:value;..." strings).
# num_cores: number of parallel workers (default 16).
# Returns: a tibble with one row per input row; `actual` and `predicted` are
#   list-columns holding the unpacked key/value data frames.
process_dataframe_parallel <- function(df, num_cores = 16) {
  # Set up the parallel backend...
  plan(multisession, workers = num_cores)
  # ...and guarantee it is torn down even if a worker errors mid-run
  # (the original only reset the plan on the success path).
  on.exit(plan(sequential), add = TRUE)
  df %>%
    # seq_len() is safe for zero-row input, unlike 1:nrow(.)
    split(seq_len(nrow(df))) %>%
    future_map_dfr(~ {
      actual = unpack_labels(.x$actual_labels)
      predicted = unpack_labels(.x$predicted_labels)
      # Metadata is read from the *actual* labels of the row.
      library_strategy = actual %>%
        filter(key == 'LibraryStrategy') %>%
        pull(value)
      platform = actual %>%
        filter(key == 'Platform') %>%
        pull(value)
      tibble(
        sample_id = .x$sample_id,
        query_basepairs = .x$query_basepairs,
        query_mapping = .x$query_mapping,
        actual = list(actual),
        predicted = list(predicted),
        library_strategy = library_strategy,
        platform = platform
      )
    }, .options = furrr_options(seed = TRUE))
}
This function applies the summarize_comparison function to an entire dataframe of predictions. It processes each row, comparing actual and predicted labels and collecting metadata like sample ID, query base pairs, and mapping information. The function then combines all these individual summaries into a single, comprehensive dataframe. This resulting dataframe provides a detailed overview of prediction accuracy across all samples, including various metadata for each comparison.
# Run summarize_comparison() on every row of a processed predictions table
# and stack the per-sample results, tagging each with its sample metadata.
#
# predictions_df: output of process_dataframe_parallel() — must contain
#   sample_id, query_basepairs, query_mapping, list-columns actual/predicted,
#   platform and library_strategy.
# Returns: one long data frame of per-key comparison rows across all samples.
collate_results <- function(predictions_df) {
  wanted <- c("sample_id", "query_basepairs", "query_mapping",
              "actual", "predicted", "platform", "library_strategy")
  predictions_df <- predictions_df[, wanted]
  # One comparison data frame per input row, annotated with that row's
  # metadata (scalars recycle across the comparison rows).
  per_sample <- lapply(seq_len(nrow(predictions_df)), function(i) {
    row <- predictions_df[i, ]
    out <- summarize_comparison(row$actual[[1]], row$predicted[[1]])
    out$sample_id <- row$sample_id
    out$query_basepairs <- row$query_basepairs
    out$query_mapping <- row$query_mapping
    out$platform <- row$platform
    out$library_strategy <- row$library_strategy
    out
  })
  # Stack all per-sample summaries into a single data frame.
  bind_rows(per_sample)
}
Read data
vk_df = read_csv('all_SRA_taxa/results_varKodes/predictions.csv') %>% process_dataframe_parallel
Rows: 152015 Columns: 12── Column specification ─────────────────────────────────────────────────────────
Delimiter: ","
chr (7): varKode_image_path, sample_id, query_mapping, trained_model_path, ac...
dbl (4): query_basepairs, query_kmer_len, basefrequency_sd, prediction_threshold
lgl (1): possible_low_quality
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cgr_df = read_csv('all_SRA_taxa/results_cgrs/predictions.csv') %>% process_dataframe_parallel
Rows: 152015 Columns: 12── Column specification ─────────────────────────────────────────────────────────
Delimiter: ","
chr (7): varKode_image_path, sample_id, query_mapping, trained_model_path, ac...
dbl (4): query_basepairs, query_kmer_len, basefrequency_sd, prediction_threshold
lgl (1): possible_low_quality
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cgr_df
vk_df
vk_cgr_df = bind_rows(vk_df, cgr_df)
summaries = collate_results(vk_cgr_df)
summaries
Now let’s calculate metrics:
# Micro-averaged metrics per (key, query size, mapping, platform, strategy):
# pool TP/FN/FP across samples first, then compute precision, recall and F1.
summaries_with_scores <- summaries %>%
group_by(key, query_basepairs, query_mapping, platform, library_strategy) %>%
summarize(
total_TP = sum(TP),
total_FN = sum(FN),
total_FP = sum(FP),
N = n(),
.groups = 'drop'
) %>%
mutate(
microprecision = total_TP / (total_TP + total_FP),
microrecall = total_TP / (total_TP + total_FN),
F1 = 2 * (microprecision * microrecall) / (microprecision + microrecall)
) %>%
# 0/0 divisions produce NaN (treated as NA); score those as zero.
replace_na(list(microprecision = 0, microrecall = 0, F1 = 0)) %>%
select(query_basepairs, query_mapping, key, F1, microprecision, microrecall, everything()) %>%
# Keep only "round" query sizes: 1, 2 or 5 followed by zeros (1-2-5 series).
filter(
grepl("^[125]0{1,}$", format(query_basepairs, scientific = FALSE, trim = TRUE))
)
summaries_with_scores
# Long-format plotting table: one row per metric value, restricted to the
# label keys shown in the figures; rename the 'cgr' mapping to 'rfCGR' for
# display.
plot_df = summaries_with_scores %>%
rename(
precision = microprecision,
recall = microrecall
) %>%
pivot_longer(
cols = c(F1, precision, recall),
names_to = "metric",
values_to = "metric_value"
) %>%
filter(key %in% c("LibraryStrategy", "Platform", "Taxonomy_all", "Taxonomy_family", "Taxonomy_genus", "Taxonomy_species")) %>%
mutate(query_mapping = ifelse(query_mapping=='cgr','rfCGR',query_mapping))
This graph shows how well the two methods perform at recognizing the library strategy:
# X-axis breaks: 1-2-5 series of query sizes from 500 Kbp to 200 Mbp.
brks = c(500000,
1000000,
2000000,
5000000,
10000000,
20000000,
50000000,
100000000,
200000000
)
# Subset to the library-strategy key only.
plot_df_strat = plot_df %>% filter(key %in% c("LibraryStrategy"))
# Per-facet sample counts, rendered as "N = ..." labels on the plots.
Ns <- plot_df_strat %>%
group_by(platform, library_strategy) %>%
summarize(N = max(N), .groups = "drop") %>%
mutate(N = paste0("N = ", N))
# Metric curves vs query size for library-strategy prediction, faceted by
# platform and strategy, with per-facet N labels.
ggplot(plot_df_strat,
       aes(x=query_basepairs, y=metric_value, color=metric,linetype=query_mapping,group=interaction(metric,query_mapping))) +
  geom_line() +
  facet_grid(platform~library_strategy) +
  # Axis labels fixed: x carries base pairs, y the metric value
  # (they were previously swapped).
  labs(title="Accuracy in predicting the library strategy (GBS, RAD or WGS)",
       x="Base pairs in query images",
       y="Metric value",
       color="Metric",
       linetype="K-mer Mapping") +
  scale_x_log10(labels = scales::label_number(scale_cut = scales::cut_si('bp')),breaks = brks) +
  scale_y_continuous(limits=c(0,1), breaks=c(0,0.2,0.4,0.6,0.8,1)) +
  theme_minimal() +
  theme(text = element_text(size=10),
        axis.text.x = element_text(hjust=1,angle=45)) +
  # Per-facet sample-count label, placed at a fixed data position.
  geom_label(data = Ns, aes(label = N, x = 12000000, y = 0.1),
             hjust = 0.5, vjust = 0.5,
             label.padding = unit(0.2, "lines"),
             label.size = 0.2,
             size=2,
             color = "black",
             fill = "white",
             inherit.aes = FALSE)
safe_ggsave ('images_manuscript/supp_strat_prediction.pdf',width=7,height = 7,useDingbats=F)
Warning: File images_manuscript/supp_strat_prediction.pdf already exists. File not overwritten.
This graph shows how well the two methods perform at recognizing the sequencing platform:
# Metric curves vs query size for sequencing-platform prediction.
ggplot(plot_df %>% filter(key %in% c("Platform")),
       aes(x=query_basepairs, y=metric_value, color=metric,linetype=query_mapping,group=interaction(metric,query_mapping,key))) +
  geom_line() +
  facet_grid(platform~library_strategy) +
  scale_x_log10(labels = scales::label_number(scale_cut = scales::cut_si('bp')),breaks = brks) +
  scale_y_continuous(limits=c(0,1), breaks=c(0,0.2,0.4,0.6,0.8,1)) +
  # Axis labels fixed: x is the query size in base pairs, y the metric value
  # (they were previously swapped).
  labs(title="Accuracy in predicting sequencing platform",
       x="Base pairs in query images",
       y="Metric value",
       color="Metric",
       linetype="K-mer Mapping") +
  theme_minimal() +
  theme(text = element_text(size=10),
        axis.text.x = element_text(hjust=1,angle=45)) +
  # NOTE(review): `Ns` was computed for the library-strategy plot above; its
  # grouping (platform x library_strategy) matches these facets, but confirm
  # these counts are the intended ones for this figure.
  geom_label(data = Ns, aes(label = N, x = 12000000, y = 0.1),
             hjust = 0.5, vjust = 0.5,
             label.padding = unit(0.2, "lines"),
             label.size = 0.2,
             size=2,
             color = "black",
             fill = "white",
             inherit.aes = FALSE)
safe_ggsave ('images_manuscript/supp_platform_prediction.pdf',width=7,height = 7,useDingbats=F)
Warning: File images_manuscript/supp_platform_prediction.pdf already exists. File not overwritten.
This graph shows how well the two methods perform at different taxonomic levels:
tax_labels = c("all ranks", "family", "genus", "species")
names(tax_labels) = c("Taxonomy_all", "Taxonomy_family", "Taxonomy_genus", "Taxonomy_species")
plot_df_tax = plot_df %>%
filter(key %in% c("Taxonomy_all", "Taxonomy_family", "Taxonomy_genus", "Taxonomy_species")) %>%
mutate(key = tax_labels[key],
platform = ifelse(platform=='OXFORD_NANOPORE','NANOPORE',platform))
Ns <- plot_df_tax %>%
group_by(platform, library_strategy,key) %>%
summarize(N = max(N), .groups = "drop") %>%
mutate(N = paste0("N = ", N))
ggplot(plot_df_tax,
aes(x=query_basepairs,
y=metric_value, color=metric,
linetype=query_mapping,
group=interaction(metric,query_mapping,key))) +
geom_line() +
facet_grid(key~platform+library_strategy) +
scale_x_log10(labels = scales::label_number(scale_cut = scales::cut_si('bp')),breaks = brks) +
scale_y_continuous(limits=c(0,1), breaks=c(0,0.2,0.4,0.6,0.8,1)) +
geom_label(data = Ns, aes(label = N, x = 7000000, y = 0.1),
hjust = 0.5, vjust = 0.5,
label.padding = unit(0.2, "lines"),
label.size = 0.2,
size=2,
color = "gray30" ,
fill = "gray96",
inherit.aes = FALSE) +
labs(title="Accuracy in predicting taxonomy",
y="Metric value",
x="Base pairs in query images",
color="Metric",
linetype="K-mer Mapping") +
theme_minimal() +
theme(text = element_text(size=7),
axis.text.x = element_text(hjust=1,angle=45),
legend.position = 'bottom')
For the manuscript, let’s simplify and only show rfCGR:
ggplot(plot_df_tax %>% filter(query_mapping=='rfCGR',metric != "F1"),
aes(x=query_basepairs,
y=metric_value, color=metric,
group=metric)) +
geom_line() +
facet_grid(key~platform+library_strategy) +
scale_x_log10(labels = scales::label_number(scale_cut = scales::cut_si('bp')),breaks = brks) +
scale_y_continuous(limits=c(0,1), breaks=c(0,0.2,0.4,0.6,0.8,1)) +
geom_label(data = Ns, aes(label = N, x = 7000000, y = 0.1),
hjust = 0.5, vjust = 0.5,
label.padding = unit(0.2, "lines"),
label.size = 0.2,
size=2,
color = "gray30" ,
fill = "gray96",
inherit.aes = FALSE) +
labs(title="Accuracy in predicting taxonomy",
y="Metric value",
x="Base pairs in query images",
color="Metric",
linetype="K-mer Mapping") +
theme_minimal() +
theme(text = element_text(size=7),
axis.text.x = element_text(hjust=1,angle=45),
legend.position = 'bottom')
safe_ggsave ('images_manuscript/fig_X_taxonomy_prediction.pdf',width=7,height = 5,useDingbats=F)
Warning: File images_manuscript/fig_X_taxonomy_prediction.pdf already exists. File not overwritten.
Let’s now try to understand the relationship between number of samples available for a label and its accuracy. For each label, we will summarize the N in training set, precision, recall and F1 score in the validation set. Then we will plot the relationship between number of training samples and validation metrics.
# Score agreement at the level of individual "key:value" labels (rather than
# keys alone), so per-label accuracy can later be related to training counts.
#
# actual, predicted: data frames with `key` and `value` columns.
# Returns: one row per unique key:value label with counts and TP/FN/FP.
summarize_comparison_with_value <- function(actual, predicted) {
  # Build the combined "key:value" label for each row.
  actual$k_v <- paste(actual$key, actual$value, sep = ":")
  predicted$k_v <- paste(predicted$key, predicted$value, sep = ":")

  # Every label seen on either side.
  all_kv <- unique(c(actual$k_v, predicted$k_v))

  # Score a single label.
  score_kv <- function(kv) {
    av <- actual$value[actual$k_v == kv]
    pv <- predicted$value[predicted$k_v == kv]
    data.frame(
      k_v = kv,
      N_actual = length(av),
      N_predicted = length(pv),
      TP = sum(av %in% pv),
      FN = sum(!av %in% pv),
      FP = sum(!pv %in% av),
      stringsAsFactors = FALSE
    )
  }

  rows <- lapply(all_kv, score_kv)
  if (length(rows) == 0) {
    # Preserve the zero-row frame the original produced for empty input.
    return(data.frame(k_v = character(0), N_actual = integer(0),
                      N_predicted = integer(0), TP = integer(0),
                      FN = integer(0), FP = integer(0),
                      stringsAsFactors = FALSE))
  }
  do.call(rbind, rows)
}
# Per-label (key:value) analogue of collate_results(): run
# summarize_comparison_with_value() on every row and stack the results,
# tagging each with its sample metadata.
collate_results_with_value <- function(predictions_df) {
  wanted <- c("sample_id", "query_basepairs", "query_mapping",
              "actual", "predicted", "platform", "library_strategy")
  predictions_df <- predictions_df[, wanted]
  # One comparison data frame per input row, annotated with row metadata.
  per_sample <- lapply(seq_len(nrow(predictions_df)), function(i) {
    row <- predictions_df[i, ]
    out <- summarize_comparison_with_value(row$actual[[1]], row$predicted[[1]])
    out$sample_id <- row$sample_id
    out$query_basepairs <- row$query_basepairs
    out$query_mapping <- row$query_mapping
    out$platform <- row$platform
    out$library_strategy <- row$library_strategy
    out
  })
  # Stack all per-sample summaries into a single data frame.
  bind_rows(per_sample)
}
summaries_kv = collate_results_with_value(vk_cgr_df)
summaries_kv
metrics_by_label = summaries_kv %>%
select(-sample_id,-query_basepairs) %>%
group_by(k_v) %>%
summarise(across(where(is.numeric), sum)) %>%
mutate(
microprecision = TP / (TP + FP),
microrecall = TP / (TP + FN),
F1 = 2 * (microprecision * microrecall) / (microprecision + microrecall)
) %>%
replace_na(list(microprecision = 0, microrecall = 0, F1 = 0))
metrics_by_label
Now we only need to count the occurrences of each label in the training set
metrics_by_label = read_csv('all_SRA_taxa/labels_ML_training.csv') %>%
filter(!sample %in% vk_cgr_df$sample_id) %>%
pull(labels) %>%
str_split(';') %>%
unlist %>%
table %>%
as_tibble(column_name='k_v',n="N_training") %>%
rename(k_v = ".") %>%
right_join(metrics_by_label) %>%
pivot_longer(microprecision:F1,names_to = 'metric',values_to = 'value') %>%
arrange(N_training)
Rows: 230644 Columns: 2── Column specification ─────────────────────────────────────────────────────────
Delimiter: ","
chr (2): sample, labels
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Joining with `by = join_by(k_v)`
metrics_by_label
# Binned summary of precision/recall vs training-set size, used to overlay a
# median trend line on the scatter.
metrics_summary <- metrics_by_label %>%
filter(metric !='F1') %>%
# Create log-scaled bins
mutate(bin = cut(log10(N_training), breaks = 30)) %>%
group_by(bin, metric) %>%
summarise(
median = median(value),
q25 = quantile(value, 0.25),
q75 = quantile(value, 0.75),
# x position for the trend line: the median N_training within the bin
# (NOT the geometric bin midpoint, despite the axis being log-scaled)
x = median(N_training),
N=n()
)
`summarise()` has grouped output by 'bin'. You can override using the `.groups` argument.
metrics_summary
# 2-D histogram of per-label precision/recall against training-set size, with
# the binned median overlaid in orange. The commented geoms (ribbon, points)
# are alternative overlays kept for reference.
ggplot() +
geom_bin2d(data = metrics_by_label %>% filter(metric !='F1'), aes(x = N_training, y = value)) +
#geom_ribbon(data = metrics_summary,
# aes(x = x, ymin = q25, ymax = q75),
# alpha = 0.2,
# fill = "blue") +
geom_line(data = metrics_summary,
aes(x = x, y = median),
color = "orange",
linewidth = 1) +
#geom_point(data = metrics_by_label %>% filter(metric !='F1'),
# aes(x = N_training, y = value),
# alpha = 0.1) +
scale_x_log10() +
# Log color scale because label counts span several orders of magnitude.
scale_fill_viridis_c(trans="log",name="Label counts",breaks=c(1,3,10,30,100,300,1000)) +
facet_wrap(~metric, labeller = labeller(metric=c('microprecision'='Precision','microrecall'='Recall'))) +
theme_minimal() +
labs(
x = "Number of Training Examples (log scale)",
y = "Value",
title = "Validation Metrics by Training Set Size"
)
safe_ggsave ('images_manuscript/fig_X_n_train.pdf',width=7,height = 2.5,useDingbats=F)
Warning: File images_manuscript/fig_X_n_train.pdf already exists. File not overwritten.
Let’s now get some numbers for publication. What is the average precision at species level?
plot_df %>%
filter(key=='Taxonomy_species',
metric %in% c('precision','recall'),
query_mapping=='varKode') %>%
group_by(metric,platform) %>%
summarise(mean_value=mean(metric_value))
`summarise()` has grouped output by 'metric'. You can override using the `.groups` argument.
NA
Genus level
plot_df %>%
filter(key=='Taxonomy_genus',
metric %in% c('precision','recall'),
query_mapping=='varKode') %>%
group_by(metric,platform) %>%
summarise(mean_value=mean(metric_value))
`summarise()` has grouped output by 'metric'. You can override using the `.groups` argument.
NA
Family level
plot_df %>%
filter(key=='Taxonomy_family',
metric %in% c('precision','recall'),
query_mapping=='varKode') %>%
group_by(metric,platform) %>%
summarise(mean_value=mean(metric_value))
`summarise()` has grouped output by 'metric'. You can override using the `.groups` argument.
All taxonomy
plot_df %>%
filter(key=='Taxonomy_all',
metric %in% c('precision','recall'),
query_mapping=='varKode') %>%
group_by(metric,platform) %>%
summarise(mean_value=mean(metric_value))
`summarise()` has grouped output by 'metric'. You can override using the `.groups` argument.
Here we just query our results to get a few figures that we report in the paper.
Total number of samples used in cross-validation:
dim(samp_labels)
[1] 287 2
Number of Stigmaphyllon samples with each kind of error for varkoder:
summary_species
Number of Stigmaphyllon samples with each kind of error for skmer:
skmer_summary_species
Traditional barcode accuracy for species:
barcode_summary_species %>% arrange(query_bp,marker)
Concatenated barcode accuracy for species:
concat_summary_species
varKoder accuracy for genera:
summary_genus
varKoder accuracy for family:
summary_family
Skmer accuracy for genera:
skmer_summary_genus
Skmer accuracy for family:
skmer_summary_family
Number of samples available for each genus and data amount
results %>%
mutate(genus = str_extract(actual_labels,"(?<=genus:)[^;]+")) %>%
group_by(query_bp) %>%
summarize(N=n()) %>%
complete()
Plot number of samples for supplementary material.
n_samples_genera = results %>%
mutate(taxon = str_extract(actual_labels,"(?<=genus:)[^;]+")) %>%
group_by(taxon, query_bp) %>%
summarize(N=n()) %>%
ungroup() %>%
complete(taxon, query_bp, fill = list(N=0)) %>%
mutate(taxon = fct_reorder(taxon, N))
`summarise()` has grouped output by 'taxon'. You can override using the `.groups` argument.
n_samples_genera
n_samples_species = results %>%
mutate(taxon = str_extract(actual_labels,"(?<=species:)[^;]+")) %>%
filter(!is.na(taxon)) %>%
group_by(taxon, query_bp) %>%
summarize(N=n()) %>%
ungroup() %>%
complete(taxon, query_bp, fill = list(N=0)) %>%
mutate(taxon = fct_reorder(taxon, N))
`summarise()` has grouped output by 'taxon'. You can override using the `.groups` argument.
n_samples_species
For SRA eukaryotes, we have to count both validation and training samples, since we did not do cross-validation. Let’s use image names to get the information and then the results table to figure out which ones were in the validation set.
# Collect all varKode image files (training + query sets).
# Fixes: the query-set path previously lacked a '/' between directory
# components ('all_SRA_eukaryote_familiesvarkoder_query_images'), so the
# second list.files() silently scanned a non-existent directory; also,
# list.files(pattern=) takes a regex, so the glob '*.png' is replaced with
# an anchored '\\.png$'.
all_files = c(list.files('all_SRA_eukaryote_families/varkoder_images_SRA/', pattern = '\\.png$', recursive = TRUE),
              list.files('all_SRA_eukaryote_families/varkoder_query_images/', pattern = '\\.png$', recursive = TRUE))
# Count samples per family / query size / validation status.
n_samples_SRA = data.frame(filename = all_files) %>%
  mutate(
    sample_id = str_extract(filename, "^(.+)(?=@)"), # everything before the "@", excluding it
    # Query-size token from the filename, e.g. "200000K". It is kept as a
    # string here — the K suffix is NOT expanded to x1000 at this point
    # (the previous comment claiming a multiplication was misleading).
    query_bp = str_extract(filename, "(?<=@)([0-9]+)K")
  ) %>%
  # Attach kingdom and family metadata from the SRA run table.
  left_join(read_csv('all_SRA_eukaryote_families/runs_to_download_data.csv') %>%
              select(sample_id = Run, Kingdom, taxon = FamilyID)) %>%
  mutate(taxon = as.character(taxon)) %>%
  # A sample is in the validation set iff it appears in the results table.
  mutate(validation_set = sample_id %in% varKoder_SRA_results$sample_id) %>%
  group_by(taxon, query_bp, validation_set) %>%
  summarize(N = n()) %>%
  ungroup() %>%
  # Order families by count for nicer stacked plotting.
  mutate(taxon = fct_reorder(taxon, N))
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
dat <- vroom(...)
problems(dat)Rows: 8264 Columns: 51── Column specification ─────────────────────────────────────────────────────────
Delimiter: ","
chr (28): Run, AssemblyName, download_path, Experiment, LibraryName, Library...
dbl (11): spots, bases, spots_with_mates, avgLength, size_MB, InsertSize, In...
lgl (10): g1k_pop_code, source, g1k_analysis_group, Subject_ID, Disease, Aff...
dttm (2): ReleaseDate, LoadDate
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Joining with `by = join_by(sample_id)``summarise()` has grouped output by 'taxon', 'query_bp'. You can override using the `.groups` argument.
n_samples_SRA
# Sanity check: which training-image sample ids also appear in the validation
# results? Fixes the path (missing '/' before 'varkoder_images_SRA') — the
# rendered output above shows summary() returned no TRUE/FALSE counts,
# i.e. the broken path matched zero files — and anchors the regex.
((list.files('all_SRA_eukaryote_families/varkoder_images_SRA/', pattern = '\\.png$', recursive = TRUE) %>%
    str_extract("^(.+)(?=@)")) %in%
   varKoder_SRA_results$sample_id) %>% summary
Mode
logical
# Stacked-area chart of sample counts per taxon across query data amounts.
#
# df:    data frame with columns taxon (factor), query_bp and N.
# title: plot title (string or plotmath expression).
#
# NOTE(review): the x-axis breaks/limits are taken from the GLOBAL
# `n_samples_genera`, not from `df` — this keeps all panels on a shared axis
# but silently couples the function to that global; confirm before reuse.
plot_Nsamples_area = function(df, title){
df = df #%>%
# mutate(query_bp = parse_number(query_bp) *1000)
# Interleave the first and second halves of the turbo palette so adjacent
# stacked areas get contrasting colors.
# NOTE(review): with an odd number of taxa the second half is shorter and
# rbind() recycles it (with a warning) — confirm this is acceptable.
n_levels <- length(unique(df$taxon))
viridis_colors <- viridis::turbo(n_levels)
half_n <- ceiling(n_levels / 2)
reordered_colors <- c(rbind(viridis_colors[1:half_n], viridis_colors[(half_n + 1):n_levels]))
ggplot(df, aes(x=query_bp,y=N,fill=taxon, color = taxon, group = taxon)) +
geom_area(position= position_stack()) +
#geom_line(position='stack') +
scale_fill_manual(values = reordered_colors,
aesthetics = c('colour','fill'),
guide = 'none') +
scale_x_log10(labels = scales::label_number(scale_cut = scales::cut_si('bp')),
breaks = unique(n_samples_genera$query_bp),
limits = range(unique(n_samples_genera$query_bp))) +
scale_y_continuous(n.breaks = 10, minor_breaks = waiver()) +
ggtitle(title) +
ylab('Number of samples') +
xlab('Base pairs in query images') +
theme_few() +
# Gridlines drawn over the areas (panel.ontop) so they remain visible.
theme(axis.text.x = element_text(hjust=1,angle=45),
panel.background = element_rect(fill = NA),
panel.grid.major.y = element_line(colour = gray(0.5)),
panel.grid.minor.y = element_line(colour = gray(0.6),linetype = 2),
panel.ontop = TRUE)
}
# Build the two panels (species-level and genus-level sample counts), then
# assemble them into one figure with a shared title and shared x-axis label.
N_species = plot_Nsamples_area(n_samples_species, title = expression(italic('Stigmaphyllon')~'species')) + theme(axis.title.x = element_blank(),text = element_text(size=8))
N_genera = plot_Nsamples_area(n_samples_genera, title = 'Malpighiales genera') + theme(axis.title.x = element_blank(),text = element_text(size=8))
p = plot_grid(N_species, N_genera, nrow = 1)
# Add common plot title with white background
common_title = ggdraw() + draw_label("Number of samples available for different data amounts", fontface = 'bold', x = 0.5, hjust = 0.5) + theme(plot.background = element_rect(fill = "white", color = "white"), plot.margin = unit(c(0, 0, 0, 0), "null"))
p = plot_grid(common_title, p, ncol = 1, rel_heights = c(0.1, 1))
# Add common X-axis title with white background
x_axis_title = ggdraw() + draw_label("Post-cleaning base pairs available", x = 0.5, hjust = 0.5, vjust = 1) + theme(plot.background = element_rect(fill = "white", color = "white"), plot.margin = unit(c(-1, 0, 0, 0), "lines"))
p = plot_grid(p, x_axis_title, ncol = 1, rel_heights = c(1, 0.1))
print(p)
# NOTE(review): no plot is passed to safe_ggsave, so ggsave presumably falls
# back to last_plot(); this appears to rely on print(p) above having made p the
# last plot. Consider passing plot = p explicitly — confirm safe_ggsave
# forwards it to ggsave.
safe_ggsave ('images_manuscript/supp_fig_n_samples.pdf', width=8,height = 4)
Warning: File images_manuscript/supp_fig_n_samples.pdf already exists. File not overwritten.
safe_ggsave ('images_manuscript/supp_fig_n_samples.png', width=8,height = 4,dpi = 1200)
Warning: File images_manuscript/supp_fig_n_samples.png already exists. File not overwritten.
Total number of SRA eukaryote family samples. Validation:
# Count training vs. validation samples in the trained-model input table
# (the first column is an unnamed row index, hence the drop).
read_csv('all_SRA_eukaryote_families/varkoder_trained_model_ML/input_data.csv') %>%
  select(-1) %>%
  count(is_valid, name = 'N')
New names:Rows: 41103 Columns: 7── Column specification ─────────────────────────────────────────────────────────
Delimiter: ","
chr (2): sample, path
dbl (3): ...1, bp, labels
lgl (2): possible_low_quality, is_valid
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
For SRA all taxa, let’s summarize the number of samples for each label. Let’s start by parsing the data. We will start with train_valid_sets.csv but then filter to the samples actually used. A few might have failed to produce varKodes.
plan(multisession, workers = 10)
# Parse the label string of every SRA run into a tidy key/value tibble and
# extract LibraryStrategy and Platform as their own columns. Processed one
# row at a time, in parallel under the multisession plan set above.
all_SRA_samples = read_csv("all_SRA_taxa/train_valid_sets.csv") %>%
  split(seq_len(nrow(.))) %>% # split the dataframe into a list of single rows
  future_map_dfr( ~ {
    actual = unpack_labels(.x$labels)
    # Value(s) for one label key; NA when the key is absent so a missing key
    # cannot make the tibble() call below fail on a zero-length column
    get_label = function(k) {
      v = actual %>%
        filter(key == k) %>%
        pull(value)
      if (length(v) == 0) NA_character_ else v
    }
    tibble(
      sample_id = .x$Run,
      labels = list(actual),
      library_strategy = get_label('LibraryStrategy'),
      platform = get_label('Platform'),
      is_valid = .x$is_valid
    )
  }, .options = furrr_options(seed = TRUE) # reproducible RNG across workers
)
Rows: 255159 Columns: 5── Column specification ─────────────────────────────────────────────────────────
Delimiter: ","
chr (2): Run, labels
dbl (2): spots, AveSpotsTo20Mbp
lgl (1): is_valid
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
plan(sequential)
# Samples actually used for training: rows flagged is_valid == FALSE in the
# ViT model's input table
actual_tr = read_csv("all_SRA_taxa/varkodes_ViT_ML/input_data.csv") %>% filter(is_valid==FALSE) %>% pull(sample) %>% unique
Rows: 1374558 Columns: 7── Column specification ─────────────────────────────────────────────────────────
Delimiter: ","
chr (4): sample, img_kmer_mapping, path, labels
dbl (2): bp, img_kmer_size
lgl (1): is_valid
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Samples actually used as validation queries: everything with a prediction
actual_vd = read_csv("all_SRA_taxa/results_varKodes/predictions.csv") %>% pull(sample_id) %>% unique
Rows: 152015 Columns: 12── Column specification ─────────────────────────────────────────────────────────
Delimiter: ","
chr (7): varKode_image_path, sample_id, query_mapping, trained_model_path, ac...
dbl (4): query_basepairs, query_kmer_len, basefrequency_sd, prediction_threshold
lgl (1): possible_low_quality
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Keep only samples that actually produced varKodes (training or validation)
all_SRA_samples = all_SRA_samples%>% filter(sample_id %in% c(actual_tr,actual_vd))
all_SRA_samples
NA
Now let’s count the number of samples for each label
# Sample counts per label, restricted to family- and species-level labels,
# broken down by validation split, library strategy and platform. Filtering
# first avoids counting groups that would be discarded anyway.
N_summary = all_SRA_samples %>%
  unnest(labels) %>%
  rename(label_key = key, label_value = value) %>%
  filter(label_key %in% c('Taxonomy_family', 'Taxonomy_species')) %>%
  count(is_valid, library_strategy, platform, label_key, label_value,
        name = 'N_samples')
N_summary
Now, for each combination of library strategy, platform and whether sample is validation, we will summarize the range, mean and median number of samples per label.
# Distribution of samples-per-label (count, range, quartiles, mean, median)
# for each validation split / platform / library strategy / label-key combo.
# .groups = "drop" returns an ungrouped result directly.
N_per_label_key = N_summary %>%
  group_by(is_valid, platform, library_strategy, label_key) %>%
  summarise(
    n_labels = n(),
    min = min(N_samples, na.rm = TRUE),
    q1 = quantile(N_samples, 0.25, na.rm = TRUE),
    median = median(N_samples, na.rm = TRUE),
    mean = mean(N_samples, na.rm = TRUE),
    q3 = quantile(N_samples, 0.75, na.rm = TRUE),
    max = max(N_samples, na.rm = TRUE),
    .groups = "drop"
  )
`summarise()` has grouped output by 'is_valid', 'platform', 'library_strategy'. You can override using the `.groups` argument.
N_per_label_key
dir.create('tables_manuscript',showWarnings = F)
write_csv(N_per_label_key, 'tables_manuscript/summary_samples_all_SRA.csv')
Let’s now count the total number of unique taxonomy labels:
# Total number of distinct taxonomy labels (family + species pooled)
N_summary %>%
  filter(str_starts(label_key, 'Taxonomy')) %>%
  distinct(label_value) %>%
  nrow()
[1] 14151
And the number of unique accessions:
all_SRA_samples$sample_id %>% unique %>% length()
[1] 254819
Pooled across sequencing methods for families and all
# Micro-averaged score ranges pooled across sequencing methods, at the
# family and all-taxa levels
summaries_with_scores %>% filter(key %in% c('Taxonomy_family','Taxonomy_all')) %>% group_by(key) %>%
summarise(min_pr=min(microprecision),
max_pr=max(microprecision),
min_rec=min(microrecall),
max_rec=max(microrecall))
Discriminating sequencing methods for genera and species
# At genus/species level, rank sequencing methods by precision for 10 Mbp
# queries
summaries_with_scores %>%
filter(key %in% c('Taxonomy_genus','Taxonomy_species'),
query_basepairs == 10000000) %>%
arrange(key,desc(microprecision))
Now let’s compare averages across mapping for species level.
# Mean precision/recall per query mapping, species level only
summaries_with_scores %>%
filter(key %in% c('Taxonomy_species')) %>%
group_by(query_mapping) %>%
summarise(mean(microprecision), mean(microrecall))
Calculate micro-averaged precision and recall.
# Compute micro-averaged precision and recall for multi-label predictions.
#
# results:         data frame with list-columns `query_labels` (true labels)
#                  and `predicted_list` (predicted labels), one row per query.
# taxonomic_level: optional level prefix (e.g. "species"); when given, rows
#                  and labels are restricted to entries of the form
#                  "<level>:<value>".
# Returns a one-row tibble with taxonomic_level, micro_precision and
# micro_recall (NA_real_ when the corresponding denominator is zero; the
# taxonomic_level column is dropped by tibble() when the argument is NULL).
calculate_precision_recall = function(results, taxonomic_level=NULL) {
  # Keep only the labels belonging to the requested taxonomic level
  filter_labels <- function(labels_list, level) {
    if (is.null(level)) {
      labels_list
    } else {
      grep(paste0("^", level, ":"), labels_list, value = TRUE)
    }
  }
  # Restrict rows to samples annotated at this level, then strip labels from
  # other levels in both the truth and the prediction columns
  filter_rows_and_labels <- function(results, level) {
    if (is.null(level)) {
      return(results)
    }
    # vapply (not sapply) guarantees a logical vector even for zero rows
    keep <- vapply(results$query_labels, function(x) {
      any(grepl(paste0("^", level, ":"), x))
    }, logical(1))
    filtered_results <- results[keep, , drop = FALSE]
    filtered_results$query_labels <- lapply(filtered_results$query_labels, filter_labels, level)
    filtered_results$predicted_list <- lapply(filtered_results$predicted_list, filter_labels, level)
    filtered_results
  }
  # Apply filtering
  filtered_results <- filter_rows_and_labels(results, taxonomic_level)
  # Aggregate TP/FP/FN over all samples (micro-averaging)
  total_true_positives <- 0
  total_false_positives <- 0
  total_false_negatives <- 0
  for (i in seq_len(nrow(filtered_results))) {
    query_labels <- filtered_results$query_labels[[i]]
    predicted_labels <- filtered_results$predicted_list[[i]]
    # NA / empty-string entries count as "no label", not as errors
    true_positives <- sum(predicted_labels %in% query_labels)
    false_positives <- sum(!predicted_labels %in% query_labels & !is.na(predicted_labels) & predicted_labels != "")
    false_negatives <- sum(!query_labels %in% predicted_labels & !is.na(query_labels) & query_labels != "")
    total_true_positives <- total_true_positives + true_positives
    total_false_positives <- total_false_positives + false_positives
    total_false_negatives <- total_false_negatives + false_negatives
  }
  # Scalar conditions: plain if/else rather than vectorized ifelse()
  micro_precision <- if (total_true_positives + total_false_positives > 0) {
    total_true_positives / (total_true_positives + total_false_positives)
  } else {
    NA_real_
  }
  micro_recall <- if (total_true_positives + total_false_negatives > 0) {
    total_true_positives / (total_true_positives + total_false_negatives)
  } else {
    NA_real_
  }
  tibble(taxonomic_level = taxonomic_level, micro_precision = micro_precision, micro_recall = micro_recall)
}
Precision and recall for species:
filter(results,str_detect(actual_labels,'species')) %>%
split(.$query_bp) %>%
map_dfr(~calculate_precision_recall(.x,'species'),.id='query_bp')
Precision and recall for genera:
filter(results,str_detect(actual_labels,'genus')) %>%
split(.$query_bp) %>%
map_dfr(~calculate_precision_recall(.x,'genus'),.id='query_bp')
Precision and recall for families:
filter(results,str_detect(actual_labels,'family')) %>%
split(.$query_bp) %>%
map_dfr(~calculate_precision_recall(.x,'family'),.id='query_bp')
Precision and recall for SRA eukaryote families, rfCGR representation:
pr_rc_euk_SRA = varKoder_SRA_results %>%
split(.$query_bp) %>%
map_dfr(~calculate_precision_recall(.x),.id='query_bp') %>%
mutate(query_bp = 1000*(str_remove(query_bp,'K') %>% as.numeric))
pr_rc_euk_SRA
Precision and recall for SRA eukaryote families, varKode representation:
pr_rc_euk_SRA_vk = varKoder_SRA_results_vk %>%
split(.$query_bp) %>%
map_dfr(~calculate_precision_recall(.x),.id='query_bp') %>%
mutate(query_bp = 1000*as.integer(query_bp))
pr_rc_euk_SRA_vk
Let’s join both tables:
full_join(pr_rc_euk_SRA,pr_rc_euk_SRA_vk,by="query_bp",suffix = c('.varKode','.rfCGR'))
Precision and recall for other taxa, varKode representation:
other_results %>%
filter(in_training_model == 'yes') %>%
split(.$dataset) %>%
map_dfr(~calculate_precision_recall(.x),.id='dataset')
Precision and recall for other taxa, CGR representation:
other_results_CGR %>%
filter(in_training_model == 'yes') %>%
split(.$dataset) %>%
map_dfr(~calculate_precision_recall(.x),.id='dataset')
Numbers correct and incorrect, CGR:
summary_others_CGR %>% group_by(dataset,result,taxon_in_training) %>% summarise(N=sum(N),p=mean(p))
`summarise()` has grouped output by 'dataset', 'result'. You can override using the `.groups` argument.
Precision and recall for conventional barcodes at species level
results_barcodes %>%
group_by(marker,query_bp) %>%
nest() %>%
mutate(precision_recall = purrr::map(data, calculate_precision_recall,'species')) %>%
select(-data) %>%
unnest(precision_recall)
Precision and recall for concatenated conventional barcodes at species level
# Species-level precision/recall for concatenated barcodes, per data amount.
# The level argument must sit inside the lambda: a positional argument after
# .id goes to map_dfr's dots and is silently ignored by the formula, so
# 'species' was never reaching calculate_precision_recall.
results_concat_barcodes %>%
  split(.$query_bp) %>%
  map_dfr(~calculate_precision_recall(.x, 'species'), .id = 'query_bp')
Precision/recall for skmer at species level. Precision and recall are not always the same because in some cases skmer predicted the wrong genus, and, therefore, there was no species prediction.
skmer_results_df %>%
split(.$query_bp) %>%
map_dfr(~calculate_precision_recall(.x,'species'),.id='query_bp')
Precision and recall for conventional barcodes at genus level
results_barcodes %>%
group_by(marker,query_bp) %>%
nest() %>%
mutate(precision_recall = purrr::map(data, calculate_precision_recall,'genus')) %>%
select(-data) %>%
unnest(precision_recall)
Precision and recall for concatenated conventional barcodes at genus level
# Genus-level precision/recall for concatenated barcodes, per data amount.
# The level argument must sit inside the lambda: a positional argument after
# .id goes to map_dfr's dots and is silently ignored by the formula, so
# 'genus' was never reaching calculate_precision_recall.
results_concat_barcodes %>%
  split(.$query_bp) %>%
  map_dfr(~calculate_precision_recall(.x, 'genus'), .id = 'query_bp')
Precision/recall for skmer at genus level:
skmer_results_df %>%
split(.$query_bp) %>%
map_dfr(~calculate_precision_recall(.x,'genus'),.id='query_bp')
Precision and recall for conventional barcodes at family level
results_barcodes %>%
group_by(marker,query_bp) %>%
nest() %>%
mutate(precision_recall = purrr::map(data, calculate_precision_recall,'family')) %>%
select(-data) %>%
unnest(precision_recall)
Precision and recall for concatenated conventional barcodes at family level
# Family-level precision/recall for concatenated barcodes, per data amount.
# The level argument must sit inside the lambda: a positional argument after
# .id goes to map_dfr's dots and is silently ignored by the formula, so
# 'family' was never reaching calculate_precision_recall.
results_concat_barcodes %>%
  split(.$query_bp) %>%
  map_dfr(~calculate_precision_recall(.x, 'family'), .id = 'query_bp')
Precision/recall for skmer at family level:
skmer_results_df %>%
split(.$query_bp) %>%
map_dfr(~calculate_precision_recall(.x,'family'),.id='query_bp')